diff --git a/docs/lib/fast-array-onebuf.mjs b/docs/lib/fast-array-onebuf.mjs
new file mode 100644
index 0000000..5ed833c
--- /dev/null
+++ b/docs/lib/fast-array-onebuf.mjs
@@ -0,0 +1,340 @@
+// One-buffer PDFArray: every committed element lives in a single
+// append-only JS Array (arrayMain), kept for the document's lifetime.
+// Mirror of fast-dict-onebuf's strategy applied to PDFArray. Backing
+// is a plain heterogeneous JS Array -- slots hold the original
+// PDFObject references directly. No encoding, no decode on read; the
+// hot path is `arrayMain[start + i]`.
+//
+// Phase 3 of fast-dict-encoded did the same range-view refactor on
+// PDFArray but used a Float64Array + encoded slots (mirroring its
+// dict shape). The encoded backing cost ~300 ms of decodeValue
+// dispatch during save (PDFArray.copyBytesInto iterates ~500 k
+// elements). This shim keeps the heap win (~19 MB on the book by
+// removing each PDFArray's per-instance `this.array = []`) without
+// paying the decode cost: slots are JS references, reads are direct.
+//
+// 40-bit packed Number layout (well within Number.MAX_SAFE_INTEGER):
+//   bits  0-23: start  (24 bits, max 16 M slots in arrayMain)
+//   bits 24-39: length (16 bits, max 65 535 elements; max observed
+//                       ~25 k on the book)
+//   bits 40-52: spare (13 bits)
+//
+// Recursion. parseArray pushes elements onto a per-parser _arrayTemp;
+// inner parseArray invocations append on top, commit their frame to
+// arrayMain in one append, and pop temp back. Inner / outer ranges
+// in arrayMain do not overlap. _arrayTemp is independent of
+// fast-dict-onebuf's _dictTemp so dict <-> array recursion is fine.
+//
+// Mutations:
+//   - set(i, v): in-place replace (safe; no length change)
+//   - push(v) at HWM:    in-place extend (no other arrays follow)
+//   - push(v) not at HWM: COW the range to tail, then push
+//   - insert / remove:   always COW (shifts would corrupt neighbours)
+// Same at-HWM-determines-safety logic as fast-dict-onebuf; no owned
+// bit needed (see fast-dict-onebuf commit 7e8b1f7).
+//
+// Singleton PDFContext (one PDFDocument.load per process in our
+// pipeline). The singleton is duplicated rather than shared with
+// fast-dict-onebuf -- the mechanism is ten lines and keeping each
+// shim independently injectable is worth more than dedup'ing it.
+// Both shims end up holding references to the same PDFContext.
+//
+// Composes with --fast-dict-onebuf. Mutually exclusive with
+// --fast-dict-encoded (which subsumes both via its own encoded shape).
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFArray        = require('pdf-lib/cjs/core/objects/PDFArray.js').default;
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+
+// ---- The single buffer ---------------------------------------------
+
+// Pre-sized to total array slots + slack on the book. Other workloads
+// grow it naturally from this starting size. When the measure-pass
+// shim runs first, it calls setExpectedArraySlots() before parse,
+// which resizes `arrayMain` to exact measured demand via
+// `arrayMain.length = N`.
+// Initial number of slots reserved in the shared buffer.
+const ARRAY_MAIN_INITIAL_CAP = 800000;
+// The single shared backing store: every PDFArray created by this shim
+// addresses a contiguous [start, start + length) range of it.
+const arrayMain = new Array(ARRAY_MAIN_INITIAL_CAP);
+// High-water mark: index of the first unused slot in arrayMain.
+let arrayMainLen = 0;
+
+export { arrayMain };
+// Accessor for the high-water mark; importers call this to read the
+// current value of the module-local counter.
+export function getArrayMainLen() { return arrayMainLen; }
+
+// Resize arrayMain in place. Must be called before any parseArray /
+// withContext (i.e. while arrayMainLen is still 0). `slack` is a
+// multiplier on `slots`; default 1.0 (exact). Same in-place-resize
+// rationale as fast-dict-onebuf's setExpectedDictSlots: reassigning
+// the module-level binding invalidates V8's inline-cache slots in
+// every closure that reads it, and the deopt + recompile shows up as
+// a parse-time allocation spike.
+export function setExpectedArraySlots(slots, slack = 1.0) {
+  if (arrayMainLen > 0) {
+    throw new Error(
+      `fast-array-onebuf: setExpectedArraySlots called after parse started (arrayMainLen=${arrayMainLen})`,
+    );
+  }
+  arrayMain.length = Math.ceil(slots * slack);
+}
+
+// ---- Bit-packing helpers -------------------------------------------
+
+const POW_24 = 16777216;          // 2^24
+const MASK_24 = 0xFFFFFF;
+const MASK_16 = 0xFFFF;
+
+const MAX_START  = POW_24;          // exclusive
+const MAX_LENGTH = 1 << 16;         // 65 536, exclusive
+
+function pack(start, length) {
+  if (start  >= MAX_START)  throw new Error(`fast-array-onebuf: start ${start} exceeds 24-bit budget`);
+  if (length >= MAX_LENGTH) throw new Error(`fast-array-onebuf: length ${length} exceeds 16-bit budget`);
+  return start + length * POW_24;
+}
+
+function _start(d)  { return d & MASK_24; }
+function _length(d) { return Math.floor(d / POW_24) & MASK_16; }
+
+// ---- Singleton context ---------------------------------------------
+
+let _singletonContext = null;
+
+function _registerContext(ctx) {
+  if (_singletonContext === null) {
+    _singletonContext = ctx;
+  } else if (_singletonContext !== ctx) {
+    throw new Error('fast-array-onebuf: expected a singleton PDFContext, got a second distinct one.');
+  }
+}
+
+// ---- Append + COW helpers ------------------------------------------
+
+function _appendFromTemp(temp, fromOffset, lenSlots) {
+  for (let i = 0; i < lenSlots; i++) {
+    arrayMain[arrayMainLen + i] = temp[fromOffset + i];
+  }
+  arrayMainLen += lenSlots;
+}
+
+function _appendArray(arr) {
+  const len = arr.length;
+  for (let i = 0; i < len; i++) arrayMain[arrayMainLen + i] = arr[i];
+  arrayMainLen += len;
+}
+
+// COW: copy this array's range to arrayMain's tail. If already at
+// the HWM, nothing to copy -- return d unchanged.
+function _cow(pa) {
+  const d = pa.d;
+  const start = _start(d);
+  const length = _length(d);
+  if (start + length === arrayMainLen) return d;   // at HWM
+  const newStart = arrayMainLen;
+  for (let i = 0; i < length; i++) arrayMain[arrayMainLen + i] = arrayMain[start + i];
+  arrayMainLen += length;
+  return pack(newStart, length);
+}
+
+// ---- Construction --------------------------------------------------
+//
+// Use a plain-function constructor (`_FastArray`) with the prototype
+// aliased to PDFArray.prototype instead of `Object.create + writes`.
+// Same shape change fast-refs-class and fast-dict-onebuf made: V8
+// gives `new`-built instances a stable hidden class from the first
+// instance and drops per-instance cost vs the slow-property path
+// taken by Object.create + later property writes.
+//
+// No subclass dispatch needed -- PDFArray has no subclasses in
+// pdf-lib (unlike PDFDict's PDFCatalog / PDFPageTree / PDFPageLeaf).
+
+// Constructor used for every PDFArray this shim creates. The only own
+// property is `d`, the packed (start, length) descriptor.
+function _FastArray(d) { this.d = d; }
+// Alias the prototype so instances resolve the PDFArray methods.
+_FastArray.prototype = PDFArray.prototype;
+
+function _makeFromRange(start, length, ctx) {
+  _registerContext(ctx);
+  return new _FastArray(pack(start, length));
+}
+
+function _makeFromAppend(arr, ctx) {
+  const start = arrayMainLen;
+  _appendArray(arr);
+  return _makeFromRange(start, arr.length, ctx);
+}
+
+if (!PDFArray.prototype.__fastArrayOnebufInstalled) {
+
+  // ---- PDFArray.prototype -----------------------------------------
+
+  PDFArray.prototype.size = function () {
+    return _length(this.d);
+  };
+
+  PDFArray.prototype.push = function (object) {
+    const d0 = this.d;
+    const start0 = _start(d0);
+    const length0 = _length(d0);
+    let dNow = d0;
+    if (start0 + length0 !== arrayMainLen) {
+      dNow = _cow(this);
+    }
+    arrayMain[arrayMainLen++] = object;
+    const start = _start(dNow);
+    this.d = pack(start, length0 + 1);
+  };
+
+  PDFArray.prototype.get = function (index) {
+    return arrayMain[_start(this.d) + index];
+  };
+
+  PDFArray.prototype.set = function (index, object) {
+    arrayMain[_start(this.d) + index] = object;
+  };
+
+  PDFArray.prototype.indexOf = function (object) {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    for (let i = 0; i < length; i++) {
+      if (arrayMain[start + i] === object) return i;
+    }
+    return undefined;
+  };
+
+  PDFArray.prototype.insert = function (index, object) {
+    // Always COW -- shifting elements in place would corrupt other
+    // arrays' ranges past this one.
+    const d0 = this.d;
+    const start0 = _start(d0);
+    const length0 = _length(d0);
+    const newStart = arrayMainLen;
+    for (let i = 0; i < index; i++) {
+      arrayMain[arrayMainLen++] = arrayMain[start0 + i];
+    }
+    arrayMain[arrayMainLen++] = object;
+    for (let i = index; i < length0; i++) {
+      arrayMain[arrayMainLen++] = arrayMain[start0 + i];
+    }
+    this.d = pack(newStart, length0 + 1);
+  };
+
+  PDFArray.prototype.remove = function (index) {
+    // Always COW (same reason as insert).
+    const d0 = this.d;
+    const start0 = _start(d0);
+    const length0 = _length(d0);
+    const newStart = arrayMainLen;
+    for (let i = 0; i < length0; i++) {
+      if (i === index) continue;
+      arrayMain[arrayMainLen++] = arrayMain[start0 + i];
+    }
+    this.d = pack(newStart, length0 - 1);
+  };
+
+  PDFArray.prototype.asArray = function () {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const out = new Array(length);
+    for (let i = 0; i < length; i++) out[i] = arrayMain[start + i];
+    return out;
+  };
+
+  PDFArray.prototype.clone = function (context) {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const newStart = arrayMainLen;
+    for (let i = 0; i < length; i++) arrayMain[arrayMainLen + i] = arrayMain[start + i];
+    arrayMainLen += length;
+    _registerContext(context || _singletonContext);
+    return new _FastArray(pack(newStart, length));
+  };
+
+  PDFArray.prototype.toString = function () {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    let s = '[ ';
+    for (let i = 0; i < length; i++) s += arrayMain[start + i].toString() + ' ';
+    return s + ']';
+  };
+
+  PDFArray.prototype.sizeInBytes = function () {
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    let size = 3;
+    for (let i = start; i < end; i++) size += arrayMain[i].sizeInBytes() + 1;
+    return size;
+  };
+
+  PDFArray.prototype.copyBytesInto = function (buffer, offset) {
+    const initialOffset = offset;
+    buffer[offset++] = CharCodes.LeftSquareBracket;
+    buffer[offset++] = CharCodes.Space;
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    for (let i = start; i < end; i++) {
+      offset += arrayMain[i].copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Space;
+    }
+    buffer[offset++] = CharCodes.RightSquareBracket;
+    return offset - initialOffset;
+  };
+
+  // lookup, lookupMaybe, asRectangle, scalePDFNumbers stay on the
+  // upstream prototype -- they call this.get / this.size / this.set
+  // and dispatch through our overrides.
+
+  // `context` is an accessor on the prototype rather than a per-instance
+  // field: reads return the module-level singleton, and assignments are
+  // accepted but ignored.
+  Object.defineProperty(PDFArray.prototype, 'context', {
+    get() { return _singletonContext; },
+    set(_ctx) { /* singleton is source of truth */ },
+    configurable: true,
+  });
+
+  // ---- PDFArray factory -------------------------------------------
+
+  PDFArray.withContext = function (context) {
+    return _makeFromAppend([], context);
+  };
+
+  // ---- PDFObjectParser.prototype.parseArray -----------------------
+  //
+  // Same temp/commit pattern as fast-dict-onebuf's parseDict:
+  // each parser instance carries its own _arrayTemp + length cursor;
+  // parseArray pushes elements onto temp's tail, commits the frame
+  // to arrayMain in one contiguous append, pops temp back to
+  // frameStart, returns a PDFArray view into arrayMain.
+
+  PDFObjectParser.prototype.parseArray = function fastParseArrayOneBuf() {
+    const bytes = this.bytes;
+    bytes.assertNext(CharCodes.LeftSquareBracket);
+    this.skipWhitespaceAndComments();
+
+    if (this._arrayTemp === undefined) {
+      this._arrayTemp = new Array(64);   // grows naturally if needed
+      this._arrayTempLen = 0;
+    }
+    const temp = this._arrayTemp;
+    const frameStart = this._arrayTempLen;
+
+    while (bytes.peek() !== CharCodes.RightSquareBracket) {
+      const element = this.parseObject();   // may recurse
+      temp[this._arrayTempLen++] = element;
+      this.skipWhitespaceAndComments();
+    }
+    bytes.assertNext(CharCodes.RightSquareBracket);
+
+    const frameLen = this._arrayTempLen - frameStart;
+    const start = arrayMainLen;
+    _appendFromTemp(temp, frameStart, frameLen);
+    this._arrayTempLen = frameStart;
+
+    return _makeFromRange(start, frameLen, this.context);
+  };
+
+  PDFArray.prototype.__fastArrayOnebufInstalled = true;
+}
diff --git a/docs/lib/fast-decode-name.mjs b/docs/lib/fast-decode-name.mjs
new file mode 100644
index 0000000..0f20a9f
--- /dev/null
+++ b/docs/lib/fast-decode-name.mjs
@@ -0,0 +1,70 @@
+// Skip pdf-lib's decodeName regex scan when the input has no `#`.
+//
+// The upstream PDFName.of
+// ([PDFName.js:100](node_modules/pdf-lib/cjs/core/objects/PDFName.js:100))
+// is the gatekeeper for every PDFName instance the parser builds:
+//
+//   PDFName.of = function (name) {
+//       var decodedValue = decodeName(name);   // <-- always runs
+//       var instance = pool.get(decodedValue);
+//       if (!instance) { ... }
+//       return instance;
+//   };
+//
+// and decodeName at line 9 is:
+//
+//   name.replace(/#([\dABCDEF]{2})/g, function (_, hex) { ... })
+//
+// PDF spec (ISO 32000-1 §7.3.5) requires `#XX` hex-escape for any
+// byte outside printable-ASCII or for delimiters / whitespace. In
+// real PDFs almost no names use it. Instrumenting on the book:
+//
+//   PDFName.of calls       : 2,759,635
+//     raw input has # char : 2 (0.000%)
+//
+// So decodeName runs a regex scan against 2.76 M strings to find a
+// `#` that's only there twice in the whole load. Profile attributes
+// ~168 ms (7 %) of process self-time to this function.
+//
+// Shim: a parallel Map<string, PDFName> keyed by the raw `name`
+// argument. When `name` contains no `#`, decoded form equals raw
+// form, so our key matches pdf-lib's internal pool key and a hit
+// returns the deduped instance with zero regex work. Misses
+// delegate to the original (which does the regex scan once and
+// stores the instance in pdf-lib's pool); we cache the result so
+// every subsequent occurrence of the same name hits our fast path.
+//
+// Names containing `#` fall through to the original unchanged --
+// the correctness path (e.g. uppercase-only regex, lowercase escapes
+// silently un-decoded) is preserved exactly.
+//
+// Mechanism: PDFName is re-exported from pdf-lib's index, so we can
+// patch PDFName.of directly without reaching into CJS internals.
+// Static initializers (PDFName.Length, .FlateDecode, ...) ran when
+// pdf-lib's module body executed -- before this shim imports -- so
+// pdf-lib's pool is already populated with the canonical instances
+// the parser will see.
+//
+// Side-effecting import. Import once before any pdf-lib operation:
+//
+//   import "./lib/fast-decode-name.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { PDFName } from "pdf-lib";
+
+if (!PDFName.__fastDecodeNameInstalled) {
+  const original = PDFName.of;
+  const fastCache = new Map();
+  PDFName.of = function fastOf(name) {
+    if (name.indexOf("#") === -1) {
+      const cached = fastCache.get(name);
+      if (cached) return cached;
+      const instance = original.call(PDFName, name);
+      fastCache.set(name, instance);
+      return instance;
+    }
+    return original.call(PDFName, name);
+  };
+  PDFName.__fastDecodeNameInstalled = true;
+}
diff --git a/docs/lib/fast-dict-array.mjs b/docs/lib/fast-dict-array.mjs
new file mode 100644
index 0000000..5f70985
--- /dev/null
+++ b/docs/lib/fast-dict-array.mjs
@@ -0,0 +1,328 @@
+// Replace PDFDict's backing Map with a flat alternating array
+// [k0, v0, k1, v1, ...].
+//
+// Motivation. The sampling heap profile of the process phase (see
+// "Profiling pdf-lib heap allocation" in perf/README.md) put `Map`
+// constructors and `Map.prototype.set` at 50 % of total allocations
+// -- ~63 MB combined -- with ~80 % of that traffic coming from one
+// site: fastParseDict's per-dict accumulator
+// ([fast-parse-dict.mjs:62](docs/lib/fast-parse-dict.mjs:62)).
+//
+//     const dict = new Map();          // 24 MB of Map() constructors
+//     while (...) {
+//       const key = this.parseName();
+//       const value = this.parseObject();
+//       dict.set(key, value);          // 38 MB of Map.set entries
+//     }
+//     ... PDFDict.fromMapWithContext(dict, this.context);
+//
+// Each parsed dict pays for one Map header + one hash-table backing
+// arena + one bucket allocation per entry. PDF dicts are tiny (typical
+// has <= 10 entries, often 2-3), so the hash-table overhead is pure
+// loss vs a linear scan -- and the Map's amortized O(1) lookup buys
+// nothing because nobody iterates a parsed dict enough times for the
+// hash to pay back.
+//
+// The fix: store entries in a flat array. One allocation per dict
+// (the array itself; the inline alternating layout avoids any per-
+// entry bucket alloc). Lookup is a linear scan, which beats Map.get
+// at this size class on every V8 microbench I've seen.
+//
+// Mechanism. We do three things:
+//
+// 1. Patch PDFDict.prototype.{keys, values, entries, set, get, has,
+//    delete, asMap, clone, toString, sizeInBytes, copyBytesInto} so
+//    `this.dict` is read as a flat array instead of a Map.
+//    sizeInBytes / copyBytesInto subsume fast-dict-iter.mjs (no
+//    Map.forEach + thisArg context object needed; iteration is just
+//    `for (let i = 0; i < arr.length; i += 2)`).
+//
+// 2. Patch PDFDict.withContext, PDFDict.fromMapWithContext, the
+//    parallel fromMapWithContext factories on PDFCatalog /
+//    PDFPageTree / PDFPageLeaf, and PDFCatalog.withContextAndPages.
+//    Each of these is rewritten to produce / accept a flat array;
+//    the Map argument is converted at the seam (rare-path cost, only
+//    a few dicts per document hit these factories). PDFPageLeaf's
+//    clone() needs no patch of its own: it passes an empty Map
+//    through fromMapWithContext and fills the clone via set().
+//
+// 3. Patch PDFObjectParser.prototype.parseDict so the parser's hot
+//    inner loop accumulates into a flat array directly (no Map(), no
+//    Map.set). The Type-sentinel dispatch at the tail becomes a
+//    short linear scan over the array; on dicts that have a /Type
+//    entry it's the first or second key (PDF convention), so the
+//    scan is effectively O(1). This subsumes fast-parse-dict.mjs.
+//
+// Compatibility. Every consumer of `dict.dict.X` inside pdf-lib
+// (ViewerPreferences, AppearanceCharacteristics, PDFAcroField,
+// PDFAcroChoice, PDFAcroText, PDFAcroForm, PDFAnnotation,
+// PDFWidgetAnnotation, BorderStyle, PDFStreamWriter, PDFCrossRefStream,
+// PDFObjectCopier, PDFXRefStreamParser, etc.) goes through
+// PDFDict.prototype methods (.set / .get / .has / .delete / .entries /
+// .lookup), all of which we re-implement to read the array. Nobody in
+// the codebase touches `dict.dict` expecting a Map iterator -- grep
+// confirmed. `asMap()` still returns a fresh `new Map(...)` for any
+// caller that genuinely wants a Map view.
+//
+// This shim is mutually exclusive with --fast-parse-dict and
+// --fast-dict-iter: both are subsumed and would re-install the
+// Map-based methods if loaded afterwards. measure.mjs enforces this.
+//
+// Side-effecting import. Import once before any pdf-lib operation:
+//
+//   import "./lib/fast-dict-array.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFDict         = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const PDFCatalog      = require('pdf-lib/cjs/core/structures/PDFCatalog.js').default;
+const PDFPageTree     = require('pdf-lib/cjs/core/structures/PDFPageTree.js').default;
+const PDFPageLeaf     = require('pdf-lib/cjs/core/structures/PDFPageLeaf.js').default;
+const PDFName         = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFNull         = require('pdf-lib/cjs/core/objects/PDFNull.js').default;
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+
+// Captured canonical PDFNames for the parser's Type-dispatch tail.
+// Pool-dedup ([PDFName.js:18,100]) guarantees reference equality with
+// whatever the parser sees inside the dict, so the dispatch can compare
+// with `===` instead of calling PDFName.of again for every dict.
+const TypeName    = PDFName.of('Type');
+const CatalogName = PDFName.of('Catalog');
+const PagesName   = PDFName.of('Pages');
+const PageName    = PDFName.of('Page');
+
+// Map -> flat array. Called at the seam from the factories below; not
+// on the hot parse path.
+function mapToArray(map) {
+  const arr = new Array(map.size * 2);
+  let i = 0;
+  for (const [k, v] of map) { arr[i++] = k; arr[i++] = v; }
+  return arr;
+}
+
+// Linear scan for the index of `key` in [k0, v0, k1, v1, ...]; returns
+// the key-slot index, or -1 if absent.
+function indexOfKey(arr, key) {
+  for (let i = 0, len = arr.length; i < len; i += 2) {
+    if (arr[i] === key) return i;
+  }
+  return -1;
+}
+
+if (!PDFDict.prototype.__fastDictArrayInstalled) {
+
+  // ---- PDFDict.prototype --------------------------------------------
+
+  PDFDict.prototype.keys = function () {
+    const arr = this.dict;
+    const out = new Array(arr.length >> 1);
+    for (let i = 0, j = 0, len = arr.length; i < len; i += 2, j++) out[j] = arr[i];
+    return out;
+  };
+
+  PDFDict.prototype.values = function () {
+    const arr = this.dict;
+    const out = new Array(arr.length >> 1);
+    for (let i = 1, j = 0, len = arr.length; i < len; i += 2, j++) out[j] = arr[i];
+    return out;
+  };
+
+  PDFDict.prototype.entries = function () {
+    const arr = this.dict;
+    const out = new Array(arr.length >> 1);
+    for (let i = 0, j = 0, len = arr.length; i < len; i += 2, j++) {
+      out[j] = [arr[i], arr[i + 1]];
+    }
+    return out;
+  };
+
+  PDFDict.prototype.set = function (key, value) {
+    const arr = this.dict;
+    const idx = indexOfKey(arr, key);
+    if (idx >= 0) {
+      arr[idx + 1] = value;
+    } else {
+      arr.push(key, value);
+    }
+  };
+
+  PDFDict.prototype.get = function (key, preservePDFNull) {
+    if (preservePDFNull === undefined) preservePDFNull = false;
+    const arr = this.dict;
+    const idx = indexOfKey(arr, key);
+    if (idx < 0) return undefined;
+    const value = arr[idx + 1];
+    if (value === PDFNull && !preservePDFNull) return undefined;
+    return value;
+  };
+
+  PDFDict.prototype.has = function (key) {
+    const arr = this.dict;
+    const idx = indexOfKey(arr, key);
+    if (idx < 0) return false;
+    const value = arr[idx + 1];
+    return value !== undefined && value !== PDFNull;
+  };
+
+  PDFDict.prototype.delete = function (key) {
+    const arr = this.dict;
+    const idx = indexOfKey(arr, key);
+    if (idx < 0) return false;
+    arr.splice(idx, 2);
+    return true;
+  };
+
+  PDFDict.prototype.asMap = function () {
+    const arr = this.dict;
+    const m = new Map();
+    for (let i = 0, len = arr.length; i < len; i += 2) m.set(arr[i], arr[i + 1]);
+    return m;
+  };
+
+  PDFDict.prototype.clone = function (context) {
+    const ctx = context || this.context;
+    const cloned = this.dict.slice();
+    return new PDFDict(cloned, ctx);
+  };
+
+  PDFDict.prototype.toString = function () {
+    const arr = this.dict;
+    let s = '<<\n';
+    for (let i = 0, len = arr.length; i < len; i += 2) {
+      s += arr[i].toString() + ' ' + arr[i + 1].toString() + '\n';
+    }
+    return s + '>>';
+  };
+
+  PDFDict.prototype.sizeInBytes = function () {
+    const arr = this.dict;
+    let size = 5;
+    for (let i = 0, len = arr.length; i < len; i += 2) {
+      size += arr[i].sizeInBytes() + arr[i + 1].sizeInBytes() + 2;
+    }
+    return size;
+  };
+
+  PDFDict.prototype.copyBytesInto = function (buffer, offset) {
+    const initialOffset = offset;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.Newline;
+    const arr = this.dict;
+    for (let i = 0, len = arr.length; i < len; i += 2) {
+      offset += arr[i].copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Space;
+      offset += arr[i + 1].copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Newline;
+    }
+    buffer[offset++] = CharCodes.GreaterThan;
+    buffer[offset++] = CharCodes.GreaterThan;
+    return offset - initialOffset;
+  };
+
+  // ---- PDFDict factories --------------------------------------------
+
+  PDFDict.withContext = function (context) {
+    return new PDFDict([], context);
+  };
+  PDFDict.fromMapWithContext = function (map, context) {
+    return new PDFDict(mapToArray(map), context);
+  };
+
+  // ---- Subclass factories -------------------------------------------
+  // PDFCatalog.withContextAndPages builds a fresh 2-entry Map; just
+  // hand it the equivalent 2-entry array.
+
+  PDFCatalog.withContextAndPages = function (context, pages) {
+    return new PDFCatalog(
+      [PDFName.of('Type'), CatalogName, PagesName, pages],
+      context,
+    );
+  };
+  PDFCatalog.fromMapWithContext = function (map, context) {
+    return new PDFCatalog(mapToArray(map), context);
+  };
+
+  PDFPageTree.fromMapWithContext = function (map, context) {
+    return new PDFPageTree(mapToArray(map), context);
+  };
+
+  PDFPageLeaf.fromMapWithContext = function (map, context, autoNormalizeCTM) {
+    return new PDFPageLeaf(mapToArray(map), context, autoNormalizeCTM);
+  };
+  // PDFPageLeaf.prototype.clone constructs `new Map()` explicitly,
+  // then copies via this.entries() + clone.set(); since clone.set is
+  // PDFDict.prototype.set (now array-aware), it works as long as
+  // fromMapWithContext receives an empty Map and converts it.
+  // mapToArray(new Map()) yields []; nothing to patch here.
+
+  // ---- PDFObjectParser.prototype.parseDict --------------------------
+  // Subsumes fast-parse-dict.mjs: no `new Map()`, no `dict.set(...)`
+  // in the hot inner loop. The Type-sentinel dispatch at the tail is
+  // a short linear scan; PDF convention places /Type first, so it's
+  // effectively O(1) per dict.
+
+  // Initial capacity for the per-dict accumulator. NOT a scratch
+  // buffer (the array isn't reused across calls -- it's allocated
+  // fresh each dict, filled with parsed entries, and handed to the
+  // PDFDict constructor where it lives as `pdfDict.dict` for the
+  // document's lifetime). Just a pre-sized initial capacity that
+  // skips push-grow's reallocation chain.
+  //
+  // Histogram from the book parse (see instrument-parsedict.mjs):
+  // 5-entry dicts dominate (52 %, exactly 10 push slots), 4-entry
+  // next (28 %, 8 slots), long tail to 7-8 entries. INITIAL_SLOTS =
+  // 10 is exact-fit for the median case; smaller dicts (2/3/4
+  // entries) waste a few slots, larger ones (7+) take one growth
+  // via push. Cuts ~70 bytes of FixedArray-header allocation per
+  // dict vs INITIAL_SLOTS=16 -- on 261 k dict invocations that
+  // adds up.
+  const INITIAL_SLOTS = 10;
+  PDFObjectParser.prototype.parseDict = function fastParseDictArray() {
+    const bytes = this.bytes;
+    bytes.assertNext(CharCodes.LessThan);
+    bytes.assertNext(CharCodes.LessThan);
+    this.skipWhitespaceAndComments();
+    const arr = new Array(INITIAL_SLOTS);
+    let len = 0;
+    while (!bytes.done() &&
+           bytes.peek() !== CharCodes.GreaterThan &&
+           bytes.peekAhead(1) !== CharCodes.GreaterThan) {
+      const key = this.parseName();
+      const value = this.parseObject();
+      if (len < INITIAL_SLOTS) {
+        arr[len]     = key;
+        arr[len + 1] = value;
+      } else {
+        // Rare overflow path: set length to current len so push
+        // appends at the right offset, then grow naturally.
+        arr.length = len;
+        arr.push(key, value);
+      }
+      len += 2;
+      this.skipWhitespaceAndComments();
+    }
+    this.skipWhitespaceAndComments();
+    bytes.assertNext(CharCodes.GreaterThan);
+    bytes.assertNext(CharCodes.GreaterThan);
+    arr.length = len;
+
+    // Type-sentinel dispatch. Inline-scan for TypeName; in practice
+    // it's at arr[0] or arr[2].
+    let Type;
+    for (let i = 0; i < len; i += 2) {
+      if (arr[i] === TypeName) { Type = arr[i + 1]; break; }
+    }
+    if (Type === CatalogName) return new PDFCatalog(arr, this.context);
+    if (Type === PagesName)   return new PDFPageTree(arr, this.context);
+    if (Type === PageName)    return new PDFPageLeaf(arr, this.context);
+    return new PDFDict(arr, this.context);
+  };
+
+  // Guard flag checked at the top of this module's install block.
+  PDFDict.prototype.__fastDictArrayInstalled = true;
+  // Mark the subsumed shims as installed so a redundant load is a no-op.
+  // fast-dict-iter.mjs checks __fastDictIterInstalled before installing
+  // its Map-based sizeInBytes / copyBytesInto.
+  PDFDict.prototype.__fastDictIterInstalled = true;
+  PDFObjectParser.prototype.__fastParseDictInstalled = true;
+}
diff --git a/docs/lib/fast-dict-iter.mjs b/docs/lib/fast-dict-iter.mjs
new file mode 100644
index 0000000..1d2a6cb
--- /dev/null
+++ b/docs/lib/fast-dict-iter.mjs
@@ -0,0 +1,81 @@
+// Replace pdf-lib's PDFDict.sizeInBytes and PDFDict.copyBytesInto -- both of
+// which materialize a fresh Array of [key, value] tuples via this.entries()
+// on every call -- with versions that iterate the underlying Map in place.
+//
+// The upstream entries() helper
+// ([PDFDict.js:22](node_modules/pdf-lib/cjs/core/objects/PDFDict.js:22)) is:
+//
+//   PDFDict.prototype.entries = function () {
+//       return Array.from(this.dict.entries());
+//   };
+//
+// Per call that is: one MapIterator + one outer Array + one fresh
+// [key, value] tuple per entry (allocated by the iterator itself). The save
+// path fires both consumers on every dict (sizeInBytes to measure first,
+// then copyBytesInto to write), so on the book that's ~100 k Array.from
+// calls feeding the GC; PDFDict.entries was the largest non-GC row in the
+// process profile (~10 % of process self-time) and (garbage collector) sat
+// at the top.
+//
+// Map.prototype.forEach((value, key) => ...) calls back with positional
+// arguments and never allocates a tuple. The two consumers don't need the
+// tuple form -- they immediately destructure -- so swapping is local.
+//
+// We do NOT touch PDFDict.prototype.entries itself: clone() and toString()
+// still call it and rely on the Array-of-tuples contract. Those paths fire
+// rarely (clone on incremental updates only, toString in debug output) and
+// aren't worth the contract churn.
+//
+// Side-effecting import. Import once before any pdf-lib save:
+//
+//   import "./lib/fast-dict-iter.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require    = createRequire(import.meta.url);
+const PDFDict    = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const CharCodes  = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+
+// Callbacks are module-level (not closures) so Map.forEach reuses the same
+// function reference on every call instead of allocating a fresh context
+// per invocation. Per-call state is threaded through forEach's `thisArg`
+// (one small object alloc per call, instead of one closure context plus
+// one heap cell for the captured `offset` mutation).
+function _sizeInBytesEntry(value, key) {
+  this.s += key.sizeInBytes() + value.sizeInBytes() + 2;
+}
+
+function _copyBytesIntoEntry(value, key) {
+  const buf = this.buf;
+  let off = this.off;
+  off += key.copyBytesInto(buf, off);
+  buf[off++] = CharCodes.Space;
+  off += value.copyBytesInto(buf, off);
+  buf[off++] = CharCodes.Newline;
+  this.off = off;
+}
+
+if (!PDFDict.prototype.__fastDictIterInstalled) {
+  PDFDict.prototype.sizeInBytes = function () {
+    const ctx = { s: 5 };
+    this.dict.forEach(_sizeInBytesEntry, ctx);
+    return ctx.s;
+  };
+
+  PDFDict.prototype.copyBytesInto = function (buffer, offset) {
+    const initialOffset = offset;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.Newline;
+    const ctx = { buf: buffer, off: offset };
+    this.dict.forEach(_copyBytesIntoEntry, ctx);
+    offset = ctx.off;
+    buffer[offset++] = CharCodes.GreaterThan;
+    buffer[offset++] = CharCodes.GreaterThan;
+    return offset - initialOffset;
+  };
+
+  PDFDict.prototype.__fastDictIterInstalled = true;
+}
diff --git a/docs/lib/fast-dict-onebuf.mjs b/docs/lib/fast-dict-onebuf.mjs
new file mode 100644
index 0000000..888705c
--- /dev/null
+++ b/docs/lib/fast-dict-onebuf.mjs
@@ -0,0 +1,546 @@
+// One-buffer PDFDict: every committed entry lives in a single
+// append-only array (main), kept for the document's lifetime. The
+// parser uses a small per-instance temp array as a stack of recursion
+// frames; each parseDict invocation appends to temp, commits its
+// frame to main in one contiguous range, and pops temp back. After
+// parseDocument completes, temp is released. PDFDict instances only
+// ever read from main, so the bufIdx field disappears from the
+// packed value -- frees up bits.
+//
+// 41-bit packed Number layout (well within Number.MAX_SAFE_INTEGER):
+//   bits  0-22: start  (23 bits, max 8.4 M slots in main; mainLen ~2.3 M today)
+//   bit     23: PDFPageLeaf `normalized` flag (zero on all other dict subtypes)
+//   bit     24: PDFPageLeaf `autoNormalizeCTM` flag (zero on all other dict subtypes)
+//   bits 25-40: length (16 bits, max 65 535 slots; max observed 8 706)
+//   bits 41-52: spare (12 bits; unused, available headroom)
+//
+// V8 Smi (31-bit signed) covers values < 2^30. start + length*2^25 stays
+// Smi iff length < 32 (the 2^30 boundary). Beyond that, `d` boxes to a
+// HeapNumber but bit math via `& MASK_*` and `+`/`-` continues to work --
+// reads still extract bits 0..30 correctly via Int32 coercion, writes
+// use arithmetic so high bits survive.
+//
+// PDFPageLeaf collapses to the same single-`d` field as plain PDFDict;
+// `normalized` and `autoNormalizeCTM` are getters/setters that mask
+// in/out of `d`'s bits 23 and 24. Heap floor matches `_FastDict` (no
+// separate boolean property slots).
+//
+// Recursion. Outer parseDict pushes entries onto temp. Calling
+// this.parseObject() to parse a value may recurse to inner
+// parseDict, which appends ON TOP of outer's pending entries. Inner
+// commits its frame to main in one append, then pops temp back to
+// the level it started at -- outer's frame is intact at the top of
+// temp again. Outer continues, eventually committing its (now
+// contiguous in temp) entries to main in one append. Outer's and
+// inner's ranges in main do not overlap; each was committed as a
+// single contiguous block at distinct points in time.
+//
+// Mutations:
+//   - set with existing key: in-place replace (safe; no shifts)
+//   - set with new key, dict at main's high-water mark: in-place
+//     push (extend the range)
+//   - set with new key, dict NOT at high-water mark: COW (copy
+//     range to main's tail, then push the new pair, update encoded
+//     value to the new range)
+//   - delete: COW (copy range minus deleted entry to tail)
+// The at-HWM check fully determines whether extending is safe;
+// each dict's range is unique to that dict (no slot sharing), so
+// extending past the dict's end at HWM never disturbs anything.
+// An earlier design tracked an owned/shared bit to gate this; it
+// was redundant -- shared dicts at HWM extend just as safely as
+// owned ones.
+//
+// Singleton PDFContext (one PDFDocument.load per process in our
+// pipeline; throws if a second distinct context appears).
+//
+// Mutually exclusive with --fast-dict-double / --fast-dict-view /
+// --fast-dict-array.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFDict         = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const PDFCatalog      = require('pdf-lib/cjs/core/structures/PDFCatalog.js').default;
+const PDFPageTree     = require('pdf-lib/cjs/core/structures/PDFPageTree.js').default;
+const PDFPageLeaf     = require('pdf-lib/cjs/core/structures/PDFPageLeaf.js').default;
+const PDFName         = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFNull         = require('pdf-lib/cjs/core/objects/PDFNull.js').default;
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+
+const TypeName    = PDFName.of('Type');
+const CatalogName = PDFName.of('Catalog');
+const PagesName   = PDFName.of('Pages');
+const PageName    = PDFName.of('Page');
+
+// ---- The single buffer + temp ---------------------------------------
+
+// Pre-sized to total entries + slack measured on the book. Other
+// workloads grow it naturally (V8-amortized array growth from this
+// starting size). When the measure-pass shim runs first, it calls
+// setExpectedDictSlots() before parse, which resizes `main` to exact
+// measured demand via `main.length = N`.
+const MAIN_INITIAL_CAP = 2400000;           // initial capacity in slots (a key and its value take one slot each)
+const main = new Array(MAIN_INITIAL_CAP);   // shared backing store; each dict owns a range laid out [k0, v0, k1, v1, ...]
+let mainLen = 0;                            // high-water mark: index of the next free slot in `main`
+
+// Exposed for measurement-only consumers (perf/instrument-*.mjs).
+// The encoded `d` values held by PDFDict instances reference main by
+// (start, length); reading the slots requires access to main itself.
+export { main };                                  // the backing array itself (a const binding; resized in place, never rebound)
+export function getMainLen() { return mainLen; }  // current high-water mark; slots at or beyond it are not committed
+
+// Resize `main` in place to an exact-sized backing array. Must be called
+// before any parseDict / withContext / fromMapWithContext (i.e. while
+// mainLen is still 0). `slack` is a multiplier on `slots`; default 1.0
+// (exact). Use a small slack only if the measure pass is approximate.
+export function setExpectedDictSlots(slots, slack = 1.0) {
+  if (mainLen > 0) {
+    throw new Error(
+      `fast-dict-onebuf: setExpectedDictSlots called after parse started (mainLen=${mainLen})`,
+    );
+  }
+  const sized = Math.ceil(slots * slack);
+  // Resize in place rather than reassigning. Reassigning the module-
+  // level `main` binding invalidates V8's inline-cache slots in every
+  // closure that reads it -- the closures get deopted on first call
+  // and recompile against the new array, with a parse-time allocation
+  // spike attributed to _appendEntries (~27 MB sampled on the book).
+  // `main.length = N` keeps the same Array identity; ICs stay valid.
+  main.length = sized;
+}
+
+// ---- Bit-packing helpers --------------------------------------------
+
+const POW_23  = 1 << 23;            // 8 388 608  -- value of bit 23 (first gap bit); also the exclusive start ceiling
+const POW_25  = 1 << 25;            // 33 554 432 -- multiplying by this moves `length` up to bit 25 and above
+const MASK_23 = 0x7FFFFF;           // low 23 bits: extracts `start`
+const MASK_16 = 0xFFFF;             // low 16 bits: extracts `length` once d has been divided by POW_25
+
+const NORM_BIT = POW_23;            // bit 23: PDFPageLeaf `normalized`
+const AUTO_BIT = POW_23 * 2;        // bit 24: PDFPageLeaf `autoNormalizeCTM`
+const GAP_MASK = NORM_BIT | AUTO_BIT;   // both flag bits; used to carry them across a repack
+
+const MAX_START  = POW_23;          // exclusive upper bound checked by pack()
+const MAX_LENGTH = 1 << 16;         // 65536, exclusive upper bound checked by pack()
+
+function pack(start, length) {
+  if (start  >= MAX_START)  throw new Error(`fast-dict-onebuf: start ${start} exceeds 23-bit budget`);
+  if (length >= MAX_LENGTH) throw new Error(`fast-dict-onebuf: length ${length} exceeds 16-bit budget`);
+  return start + length * POW_25;
+}
+
+// Read start (bits 0-22) and length (bits 25-40). Both work on
+// HeapNumber'd d: `& MASK_23` lives in low 32 bits (Int32 coercion
+// reads it correctly); `Math.floor(d / POW_25)` operates on the full
+// Number range before the `& MASK_16` truncates.
+function _start(d)  { const lowBits = d & MASK_23; return lowBits; }
+function _length(d) { const shifted = Math.floor(d / POW_25); return shifted & MASK_16; }
+
+// ---- Singleton context ---------------------------------------------
+
+let _singletonContext = null;
+
+function _registerContext(ctx) {
+  if (_singletonContext === null) {
+    _singletonContext = ctx;
+  } else if (_singletonContext !== ctx) {
+    throw new Error('fast-dict-onebuf: expected a singleton PDFContext, got a second distinct one.');
+  }
+}
+
+// ---- Append helpers ------------------------------------------------
+
+function _appendEntries(entries, fromOffset, lenSlots) {
+  for (let i = 0; i < lenSlots; i++) {
+    main[mainLen + i] = entries[fromOffset + i];
+  }
+  mainLen += lenSlots;
+}
+
+function _appendArray(arr) {
+  const len = arr.length;
+  for (let i = 0; i < len; i++) main[mainLen + i] = arr[i];
+  mainLen += len;
+}
+
+// COW: copy this dict's range to main's tail, return the new packed
+// value anchored at the new range. If we're already at the HWM,
+// nothing to copy -- return d unchanged.
+//
+// Gap bits (bits 23-24, used by PDFPageLeaf for normalized /
+// autoNormalizeCTM) are preserved across the repack. For non-PageLeaf
+// dicts the mask is zero, so `+ (d & GAP_MASK)` is a no-op. Addition
+// is used instead of `|` so the high bits of HeapNumber'd d survive.
+function _cow(pd) {
+  const d = pd.d;
+  const start = _start(d);
+  const length = _length(d);
+  if (start + length === mainLen) return d;   // at HWM, extend in place
+  const newStart = mainLen;
+  for (let i = 0; i < length; i++) main[mainLen + i] = main[start + i];
+  mainLen += length;
+  return pack(newStart, length) + (d & GAP_MASK);
+}
+
+// ---- Construction ---------------------------------------------------
+//
+// Use plain-function constructors with the prototype aliased to the
+// upstream PDFDict / PDFCatalog / PDFPageTree / PDFPageLeaf prototypes
+// instead of `Object.create(proto) + property writes`. V8 gives
+// `new`-built instances a stable hidden class derived from the
+// assignment order in the constructor body, and per-instance heap cost
+// drops materially vs the slow-property path taken by Object.create +
+// later writes (the same shape change that fast-refs-class made for
+// PDFRef: ~60 B/instance -> ~44 B). For the 260 k+ dicts on the book
+// the per-instance gap × instance count is the dominant remaining heap
+// row.
+//
+// One constructor per subclass so V8 sees a single fixed shape per
+// kind. PDFPageLeaf collapses to the same single-`d` shape as plain
+// PDFDict; `normalized` defaults to false (gap bit 23 clear) and
+// `autoNormalizeCTM` defaults to true (gap bit 24 set) -- the bit
+// is OR'd in by the constructor below via addition (so HeapNumber'd
+// d doesn't lose high bits to Int32 coercion). Both flags become
+// prototype getters/setters that mask in/out of bits 23-24.
+// Any unknown PDFDict subclass falls back to the original
+// Object.create path so the shim doesn't crash on downstream
+// extensions (none in our pipeline; defensive only).
+
+function _FastDict(d) { this.d = d; }                 // plain PDFDict shape: the single packed field `d`
+_FastDict.prototype = PDFDict.prototype;              // instances inherit the upstream PDFDict prototype
+
+function _FastCatalog(d) { this.d = d; }
+_FastCatalog.prototype = PDFCatalog.prototype;
+
+function _FastPageTree(d) { this.d = d; }
+_FastPageTree.prototype = PDFPageTree.prototype;
+
+// `d` arrives from pack(start, length), so bits 23-24 are zero on entry and
+// adding AUTO_BIT sets bit 24 (autoNormalizeCTM = true by default) without
+// carrying. Addition is used instead of `|` because `|` coerces to Int32
+// and would drop the length bits that sit above bit 31.
+function _FastPageLeaf(d) { this.d = d + AUTO_BIT; }
+_FastPageLeaf.prototype = PDFPageLeaf.prototype;
+
+function _makeFromRange(ProtoClass, start, length, ctx) {
+  _registerContext(ctx);
+  const d = pack(start, length);
+  if (ProtoClass === PDFDict)      return new _FastDict(d);
+  if (ProtoClass === PDFPageLeaf)  return new _FastPageLeaf(d);
+  if (ProtoClass === PDFCatalog)   return new _FastCatalog(d);
+  if (ProtoClass === PDFPageTree)  return new _FastPageTree(d);
+  // Defensive fallback for any unknown subclass.
+  const pd = Object.create(ProtoClass.prototype);
+  pd.d = d;
+  return pd;
+}
+
+function _makeFromAppend(ProtoClass, arr, ctx) {
+  const start = mainLen;
+  _appendArray(arr);
+  return _makeFromRange(ProtoClass, start, arr.length, ctx);
+}
+
+function mapToArray(map) {
+  const arr = new Array(map.size * 2);
+  let i = 0;
+  for (const [k, v] of map) { arr[i++] = k; arr[i++] = v; }
+  return arr;
+}
+
+if (!PDFDict.prototype.__fastDictOnebufInstalled) {
+
+  // ---- PDFDict.prototype --------------------------------------------
+
+  PDFDict.prototype.keys = function () {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const out = new Array(length >> 1);
+    for (let i = 0, j = 0; i < length; i += 2, j++) out[j] = main[start + i];
+    return out;
+  };
+
+  PDFDict.prototype.values = function () {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const out = new Array(length >> 1);
+    for (let i = 0, j = 0; i < length; i += 2, j++) out[j] = main[start + i + 1];
+    return out;
+  };
+
+  PDFDict.prototype.entries = function () {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const out = new Array(length >> 1);
+    for (let i = 0, j = 0; i < length; i += 2, j++) {
+      out[j] = [main[start + i], main[start + i + 1]];
+    }
+    return out;
+  };
+
+  PDFDict.prototype.set = function (key, value) {
+    const d0 = this.d;
+    const start0 = _start(d0);
+    const length0 = _length(d0);
+    // Try in-place replace
+    for (let i = 0; i < length0; i += 2) {
+      if (main[start0 + i] === key) { main[start0 + i + 1] = value; return; }
+    }
+    // Append: requires the dict to be at main's high-water mark, OR we COW.
+    let dNow = d0;
+    if (start0 + length0 !== mainLen) {
+      dNow = _cow(this);
+    }
+    // After _cow (or if we were already at HWM), we abut the tail.
+    main[mainLen++] = key;
+    main[mainLen++] = value;
+    const start = _start(dNow);
+    // Preserve gap bits (PageLeaf flags) from dNow into the freshly
+    // packed value. Zero for non-PageLeaf dicts.
+    this.d = pack(start, length0 + 2) + (dNow & GAP_MASK);
+  };
+
+  PDFDict.prototype.get = function (key, preservePDFNull) {
+    if (preservePDFNull === undefined) preservePDFNull = false;
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    for (let i = start; i < end; i += 2) {
+      if (main[i] === key) {
+        const value = main[i + 1];
+        if (value === PDFNull && !preservePDFNull) return undefined;
+        return value;
+      }
+    }
+    return undefined;
+  };
+
+  PDFDict.prototype.has = function (key) {
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    for (let i = start; i < end; i += 2) {
+      if (main[i] === key) {
+        const value = main[i + 1];
+        return value !== undefined && value !== PDFNull;
+      }
+    }
+    return false;
+  };
+
+  PDFDict.prototype.delete = function (key) {
+    // Always COW for delete: shifting slots in main would corrupt
+    // other dicts that point into the affected region.
+    const d0 = this.d;
+    const start0 = _start(d0);
+    const length0 = _length(d0);
+    let foundIdx = -1;
+    for (let i = 0; i < length0; i += 2) {
+      if (main[start0 + i] === key) { foundIdx = i; break; }
+    }
+    if (foundIdx < 0) return false;
+    const newStart = mainLen;
+    for (let i = 0; i < length0; i++) {
+      if (i === foundIdx || i === foundIdx + 1) continue;
+      main[mainLen++] = main[start0 + i];
+    }
+    // Preserve gap bits (PageLeaf flags); zero for non-PageLeaf dicts.
+    this.d = pack(newStart, length0 - 2) + (d0 & GAP_MASK);
+    return true;
+  };
+
+  PDFDict.prototype.asMap = function () {
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    const m = new Map();
+    for (let i = start; i < end; i += 2) m.set(main[i], main[i + 1]);
+    return m;
+  };
+
+  PDFDict.prototype.clone = function (context) {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const newStart = mainLen;
+    for (let i = 0; i < length; i++) main[mainLen + i] = main[start + i];
+    mainLen += length;
+    _registerContext(context || _singletonContext);
+    return new _FastDict(pack(newStart, length));
+  };
+
+  PDFDict.prototype.toString = function () {
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    let s = '<<\n';
+    for (let i = start; i < end; i += 2) {
+      s += main[i].toString() + ' ' + main[i + 1].toString() + '\n';
+    }
+    return s + '>>';
+  };
+
+  PDFDict.prototype.sizeInBytes = function () {
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    let size = 5;
+    for (let i = start; i < end; i += 2) {
+      size += main[i].sizeInBytes() + main[i + 1].sizeInBytes() + 2;
+    }
+    return size;
+  };
+
+  PDFDict.prototype.copyBytesInto = function (buffer, offset) {
+    const initialOffset = offset;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.Newline;
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    for (let i = start; i < end; i += 2) {
+      offset += main[i].copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Space;
+      offset += main[i + 1].copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Newline;
+    }
+    buffer[offset++] = CharCodes.GreaterThan;
+    buffer[offset++] = CharCodes.GreaterThan;
+    return offset - initialOffset;
+  };
+
+  Object.defineProperty(PDFDict.prototype, 'context', {
+    get() { return _singletonContext; },                 // every dict reports the one registered context (null until one is registered)
+    set(_ctx) { /* singleton is source of truth */ },     // assignments are accepted and ignored
+    configurable: true,                                   // leaves the property redefinable later
+  });
+
+  // ---- PDFPageLeaf flag accessors -----------------------------------
+  //
+  // `normalized` and `autoNormalizeCTM` live in bits 23 and 24 of
+  // `d`. Reads use `& BIT` -- safe on HeapNumber'd d because both
+  // bits are in the low 32 (Int32 coercion reads them correctly).
+  // Writes use arithmetic (`d + BIT` / `d - BIT`) gated on the
+  // current bit state, so high bits of HeapNumber'd d survive.
+  // No-ops when the flag is already in the requested state.
+
+  Object.defineProperty(PDFPageLeaf.prototype, 'normalized', {
+    get() { return (this.d & NORM_BIT) !== 0; },
+    set(v) {
+      const d = this.d;
+      const has = (d & NORM_BIT) !== 0;
+      if (v && !has)      this.d = d + NORM_BIT;
+      else if (!v && has) this.d = d - NORM_BIT;
+    },
+    configurable: true,
+  });
+
+  Object.defineProperty(PDFPageLeaf.prototype, 'autoNormalizeCTM', {
+    get() { return (this.d & AUTO_BIT) !== 0; },
+    set(v) {
+      const d = this.d;
+      const has = (d & AUTO_BIT) !== 0;
+      if (v && !has)      this.d = d + AUTO_BIT;
+      else if (!v && has) this.d = d - AUTO_BIT;
+    },
+    configurable: true,
+  });
+
+  // ---- PDFDict factories --------------------------------------------
+
+  PDFDict.withContext = function (context) {
+    return _makeFromAppend(PDFDict, [], context);   // empty dict: a zero-length range anchored at main's current tail
+  };
+  PDFDict.fromMapWithContext = function (map, context) {
+    return _makeFromAppend(PDFDict, mapToArray(map), context);   // flatten the map to [k, v, ...] and commit it at main's tail
+  };
+
+  PDFCatalog.withContextAndPages = function (context, pages) {
+    return _makeFromAppend(
+      PDFCatalog,
+      [PDFName.of('Type'), CatalogName, PagesName, pages],
+      context,
+    );
+  };
+  PDFCatalog.fromMapWithContext = function (map, context) {
+    return _makeFromAppend(PDFCatalog, mapToArray(map), context);
+  };
+
+  PDFPageTree.fromMapWithContext = function (map, context) {
+    return _makeFromAppend(PDFPageTree, mapToArray(map), context);   // flatten the map to [k, v, ...] and commit it at main's tail
+  };
+
+  PDFPageLeaf.fromMapWithContext = function (map, context, autoNormalizeCTM) {
+    const d = _makeFromAppend(PDFPageLeaf, mapToArray(map), context);
+    if (autoNormalizeCTM !== undefined) d.autoNormalizeCTM = autoNormalizeCTM;
+    return d;
+  };
+
+  // ---- PDFObjectParser.prototype.parseDict --------------------------
+  //
+  // Each parser instance carries its own temp array (small; sized to
+  // peak recursion-depth-stack of entries) plus a length cursor.
+  // parseDict pushes entries onto temp's tail; on completion, commits
+  // its frame to main in one contiguous append, pops temp back to
+  // frameStart, and returns a PDFDict view into main.
+
+  PDFObjectParser.prototype.parseDict = function fastParseDictOneBuf() {
+    const bytes = this.bytes;
+    bytes.assertNext(CharCodes.LessThan);
+    bytes.assertNext(CharCodes.LessThan);
+    this.skipWhitespaceAndComments();
+
+    if (this._dictTemp === undefined) {
+      this._dictTemp = new Array(64);   // grows naturally if needed
+      this._dictTempLen = 0;
+    }
+    const temp = this._dictTemp;
+    const frameStart = this._dictTempLen;
+
+    while (!bytes.done() &&
+           bytes.peek() !== CharCodes.GreaterThan &&
+           bytes.peekAhead(1) !== CharCodes.GreaterThan) {
+      const key = this.parseName();
+      const value = this.parseObject();    // may recurse; temp grows / shrinks
+      const len = this._dictTempLen;
+      temp[len]     = key;
+      temp[len + 1] = value;
+      this._dictTempLen = len + 2;
+      this.skipWhitespaceAndComments();
+    }
+    this.skipWhitespaceAndComments();
+    bytes.assertNext(CharCodes.GreaterThan);
+    bytes.assertNext(CharCodes.GreaterThan);
+
+    const frameLen = this._dictTempLen - frameStart;
+    // Commit this frame to main in one contiguous append
+    const start = mainLen;
+    _appendEntries(temp, frameStart, frameLen);
+    // Pop our frame off temp
+    this._dictTempLen = frameStart;
+
+    // Type-sentinel dispatch (scan the frame we just committed)
+    let Type;
+    const end = start + frameLen;
+    for (let i = start; i < end; i += 2) {
+      if (main[i] === TypeName) { Type = main[i + 1]; break; }
+    }
+    if (Type === CatalogName) return _makeFromRange(PDFCatalog,  start, frameLen, this.context);
+    if (Type === PagesName)   return _makeFromRange(PDFPageTree, start, frameLen, this.context);
+    if (Type === PageName)    return _makeFromRange(PDFPageLeaf, start, frameLen, this.context);
+    return _makeFromRange(PDFDict, start, frameLen, this.context);
+  };
+
+  PDFDict.prototype.__fastDictOnebufInstalled = true;
+  // Mark subsumed shims as installed, so a shim that checks its flag before patching (as fast-dict-iter does) skips itself.
+  PDFDict.prototype.__fastDictDoubleInstalled = true;
+  PDFDict.prototype.__fastDictViewInstalled = true;
+  PDFDict.prototype.__fastDictArrayInstalled = true;
+  PDFDict.prototype.__fastDictIterInstalled = true;
+  PDFObjectParser.prototype.__fastParseDictInstalled = true;
+}
diff --git a/docs/lib/fast-indirect-objects.mjs b/docs/lib/fast-indirect-objects.mjs
new file mode 100644
index 0000000..9058414
--- /dev/null
+++ b/docs/lib/fast-indirect-objects.mjs
@@ -0,0 +1,174 @@
+// Replace PDFContext.indirectObjects (Map<PDFRef, PDFObject>) with a
+// dense array keyed by objectNumber for the gen=0 path.
+//
+// Motivation. After fast-dict-array shipped, the only remaining hot
+// Map.set in the process-phase heap profile was
+// PDFContext.assign's `this.indirectObjects.set(ref, object)`:
+//
+//     $ node find-heap-callers.mjs <post-ship>.heapprofile set
+//     set: total=14.49 MB
+//       7168.04 KB   PDFParser.parseIndirectObjectHeader
+//       7168.04 KB   parseIndirectObjectSync @ fast-sync-load.mjs:140
+//        ...
+//
+// (Both ~7 MB rows are V8 inline-attribution duplicates of the same
+// logical call.) That's 14.5 MB of Map traffic for one Map -- one
+// `set` per indirect object during load, with the hash table
+// rebuilding through ~14 doubling steps to fit the book's ~9 k
+// indirect objects, discarding each intermediate arena to GC.
+//
+// PDFRefs are overwhelmingly gen=0 (revisions / incremental updates
+// are the only gen!=0 producers, and they're rare). fast-refs.mjs
+// already exploits this on the key side -- a dense array indexed by
+// objectNumber for the PDFRef pool, Map fallback for gen!=0. This
+// shim does the same on the value side for PDFContext.indirectObjects.
+//
+// Mechanism. Patch PDFContext.prototype.assign / lookup / lookupMaybe
+// / delete / getObjectRef / enumerateIndirectObjects to consult an
+// auxiliary `this._objArr` (dense array indexed by objectNumber) for
+// gen=0 PDFRefs first, falling back to the original Map for gen!=0.
+// The dense array is created lazily on first assign so we don't need
+// to touch the constructor.
+//
+// The original `this.indirectObjects` Map is left in place for two
+// reasons: (a) gen!=0 entries actually need it, and (b) external code
+// that reads `pdfContext.indirectObjects` directly (none in our
+// pipeline, but reasonable to defensive-preserve) continues to see a
+// Map-shaped object -- just usually empty.
+//
+// As a side benefit, `enumerateIndirectObjects` no longer needs to
+// sort: dense-array iteration is already in ascending objectNumber
+// order. (The Map-sourced gen!=0 entries are merged in sorted.)
+//
+// Side-effecting import. Import once before any PDFDocument.load:
+//
+//   import "./lib/fast-indirect-objects.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFContext = require('pdf-lib/cjs/core/PDFContext.js').default;
+const PDFRef     = require('pdf-lib/cjs/core/objects/PDFRef.js').default;
+const PDFNull    = require('pdf-lib/cjs/core/objects/PDFNull.js').default;
+const UnexpectedObjectTypeError = require('pdf-lib/cjs/core/errors.js').UnexpectedObjectTypeError;
+
+const byAscendingObjectNumber = ([a], [b]) => a.objectNumber - b.objectNumber;
+
+if (!PDFContext.prototype.__fastIndirectObjectsInstalled) {
+
+  // ---- assign -------------------------------------------------------
+  // Hot path. gen=0 → dense array store; gen!=0 → Map. Maintains
+  // largestObjectNumber as before.
+
+  PDFContext.prototype.assign = function (ref, object) {
+    if (ref.generationNumber === 0) {
+      if (!this._objArr) this._objArr = [];
+      this._objArr[ref.objectNumber] = object;
+    } else {
+      this.indirectObjects.set(ref, object);
+    }
+    if (ref.objectNumber > this.largestObjectNumber) {
+      this.largestObjectNumber = ref.objectNumber;
+    }
+  };
+
+  // ---- delete -------------------------------------------------------
+  // Returns true iff something was removed. Dense slots are reset to
+  // undefined (not spliced) so subsequent objectNumbers keep their slots.
+
+  PDFContext.prototype.delete = function (ref) {
+    if (ref.generationNumber === 0 && this._objArr) {
+      const slot = this._objArr[ref.objectNumber];
+      if (slot !== undefined) {
+        this._objArr[ref.objectNumber] = undefined;
+        return true;
+      }
+      return false;
+    }
+    return this.indirectObjects.delete(ref);
+  };
+
+  // ---- lookup / lookupMaybe -----------------------------------------
+  // Resolve the ref to an object via the dense array (gen=0) or Map
+  // (gen!=0), then run the original type-check tail verbatim.
+
+  function _resolve(ctx, ref) {
+    if (!(ref instanceof PDFRef)) return ref;
+    if (ref.generationNumber === 0 && ctx._objArr) {
+      return ctx._objArr[ref.objectNumber];
+    }
+    return ctx.indirectObjects.get(ref);
+  }
+
+  PDFContext.prototype.lookupMaybe = function (ref) {
+    const types = [];
+    for (let i = 1, len = arguments.length; i < len; i++) types[i - 1] = arguments[i];
+    const preservePDFNull = types.includes(PDFNull);
+    const result = _resolve(this, ref);
+    if (!result || (result === PDFNull && !preservePDFNull)) return undefined;
+    for (let idx = 0, len = types.length; idx < len; idx++) {
+      const type = types[idx];
+      if (type === PDFNull) {
+        if (result === PDFNull) return result;
+      } else {
+        if (result instanceof type) return result;
+      }
+    }
+    throw new UnexpectedObjectTypeError(types, result);
+  };
+
+  PDFContext.prototype.lookup = function (ref) {
+    const types = [];
+    for (let i = 1, len = arguments.length; i < len; i++) types[i - 1] = arguments[i];
+    const result = _resolve(this, ref);
+    if (types.length === 0) return result;
+    for (let idx = 0, len = types.length; idx < len; idx++) {
+      const type = types[idx];
+      if (type === PDFNull) {
+        if (result === PDFNull) return result;
+      } else {
+        if (result instanceof type) return result;
+      }
+    }
+    throw new UnexpectedObjectTypeError(types, result);
+  };
+
+  // ---- getObjectRef -------------------------------------------------
+  // Linear scan. Dense array first (gen=0 PDFRef reconstructed from
+  // objectNumber via PDFRef.of, which fast-refs has cached). Fall
+  // back to Map for any gen!=0 candidates.
+
+  PDFContext.prototype.getObjectRef = function (pdfObject) {
+    if (this._objArr) {
+      for (let i = 0, len = this._objArr.length; i < len; i++) {
+        if (this._objArr[i] === pdfObject) return PDFRef.of(i, 0);
+      }
+    }
+    for (const entry of this.indirectObjects) {
+      if (entry[1] === pdfObject) return entry[0];
+    }
+    return undefined;
+  };
+
+  // ---- enumerateIndirectObjects -------------------------------------
+  // Dense array is already iterable in objectNumber order. Merge in
+  // any gen!=0 entries from the Map and sort once -- but only if the
+  // Map is non-empty (the common case for parsed PDFs is empty).
+
+  PDFContext.prototype.enumerateIndirectObjects = function () {
+    const out = [];
+    if (this._objArr) {
+      for (let i = 0, len = this._objArr.length; i < len; i++) {
+        const obj = this._objArr[i];
+        if (obj !== undefined) out.push([PDFRef.of(i, 0), obj]);
+      }
+    }
+    if (this.indirectObjects.size === 0) return out;
+    for (const entry of this.indirectObjects) out.push(entry);
+    return out.sort(byAscendingObjectNumber);
+  };
+
+  PDFContext.prototype.__fastIndirectObjectsInstalled = true;   // checked by the guard above, so a repeated import patches nothing
+}
diff --git a/docs/lib/fast-inflate.mjs b/docs/lib/fast-inflate.mjs
new file mode 100644
index 0000000..db675d5
--- /dev/null
+++ b/docs/lib/fast-inflate.mjs
@@ -0,0 +1,39 @@
+// Replace pako's pure-JS inflate with Node's zlib for the one path
+// pdf-lib actually uses it on: PDFCrossRefStreamParser inflating the
+// compressed cross-reference stream during PDFDocument.load. Exactly
+// one call per load on Chrome-emitted PDFs (PDF 1.5+ xref-stream
+// format), ~4.5 KB input. Negligible wall-clock, but it's the last
+// remaining pdf-lib -> pako call site once parallelSave has taken
+// over the deflate side -- this brings the runtime pako call count
+// to zero.
+//
+// PDF /FlateDecode (ISO 32000-1 §7.4.4) is the zlib format (RFC 1950):
+// 2-byte zlib header + raw deflate body (RFC 1951) + 4-byte Adler-32
+// trailer. Both pako.inflate and zlib.inflateSync consume that
+// format, so the swap is wire-compatible.
+//
+// Mechanism: pdf-lib is CJS in node_modules and calls
+// `require("pako").inflate(...)` at the call site, not at import
+// time. Mutating the live pako exports object is enough; no fork
+// required.
+//
+// Side-effecting import. Import once before PDFDocument.load runs:
+//
+//   import "./lib/fast-inflate.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { inflateSync } from "node:zlib";
+import pako from "pako";
+
+if (!pako.__fastInflateInstalled) {
+  const original = pako.inflate;
+  pako.inflate = function fastInflate(data, options) {
+    // pdf-lib's only caller passes no options. Anything fancier
+    // (dictionary, raw, custom windowBits) goes back to pako so we
+    // don't change behaviour outside the one path we care about.
+    if (options) return original.call(pako, data, options);
+    return inflateSync(data);
+  };
+  pako.__fastInflateInstalled = true;
+}
diff --git a/docs/lib/fast-number-to-string.mjs b/docs/lib/fast-number-to-string.mjs
new file mode 100644
index 0000000..57640a9
--- /dev/null
+++ b/docs/lib/fast-number-to-string.mjs
@@ -0,0 +1,65 @@
+// Skip pdf-lib's numberToString redundant work when the input doesn't
+// stringify to exponential notation.
+//
+// The upstream numberToString
+// ([numbers.js:13](node_modules/pdf-lib/cjs/utils/numbers.js:13)) is:
+//
+//   exports.numberToString = function (num) {
+//       var numStr = String(num);
+//       if (Math.abs(num) < 1.0) {
+//           var e = parseInt(num.toString().split('e-')[1]);
+//           if (e) { ... }
+//       } else {
+//           var e = parseInt(num.toString().split('+')[1]);
+//           if (e > 20) { ... }
+//       }
+//       return numStr;
+//   };
+//
+// It always computes `numStr = String(num)` up front -- but then
+// re-calls `num.toString()`, allocates a `.split(...)` array, and
+// runs parseInt on the result, even though `numStr` is already what
+// `.toString()` returns. Exponential notation in `String(num)` only
+// appears for |num| < 1e-6 or |num| >= 1e21, neither of which real
+// PDFs emit: object refs, generations, byte offsets, content-stream
+// coordinates, /Size, /Length, etc. all stringify to plain decimal.
+//
+// Shim: short-circuit when `String(num)` contains no `'e'` and return
+// it immediately. The rare exponential cases fall through to the
+// original so the spec-compliant expansion logic is preserved.
+//
+// Why three patches and not one: pdf-lib ships compiled against
+// tslib 1.x, whose `__exportStar` does a value-copy (`exports[p] =
+// m[p]`) rather than installing a live getter. So by the time
+// PDFNumber.js's `index_1.numberToString(value)` runs, `index_1` (the
+// utils/index barrel) holds a captured reference to the original
+// function, and mutating `numbers.numberToString` alone is invisible
+// to the call site. We patch the captured copies along the re-export
+// chain: utils/numbers (source), utils/index (the barrel PDFNumber
+// reads from), and pdf-lib's top-level index (the public surface).
+//
+// Side-effecting import. Import once before any pdf-lib operation:
+//
+//   import "./lib/fast-number-to-string.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const numbers     = require('pdf-lib/cjs/utils/numbers.js');
+const utilsBarrel = require('pdf-lib/cjs/utils/index.js');
+const topBarrel   = require('pdf-lib/cjs/index.js');
+
+if (!numbers.__fastNumberToStringInstalled) {
+  const upstream = numbers.numberToString;
+  // Plain-decimal renderings go straight back to the caller; only exponent
+  // forms ("1e-7", "1e+21") reach the captured upstream expander.
+  const fastNumberToString = function fastNumberToString(num) {
+    const text = String(num);
+    return text.includes('e') ? upstream(num) : text;
+  };
+  // Overwrite every export object that holds its own copy of the function.
+  for (const holder of [numbers, utilsBarrel, topBarrel]) holder.numberToString = fastNumberToString;
+  numbers.__fastNumberToStringInstalled = true;
+}
diff --git a/docs/lib/fast-parse-dict.mjs b/docs/lib/fast-parse-dict.mjs
new file mode 100644
index 0000000..203549c
--- /dev/null
+++ b/docs/lib/fast-parse-dict.mjs
@@ -0,0 +1,87 @@
+// Hoist the four sentinel PDFName.of calls out of
+// PDFObjectParser.prototype.parseDict.
+//
+// The upstream parseDict
+// ([PDFObjectParser.js:141](node_modules/pdf-lib/cjs/core/parser/PDFObjectParser.js:141))
+// ends every dict it parses with a Type-dispatch tail:
+//
+//   var Type = dict.get(PDFName.of('Type'));
+//   if (Type === PDFName.of('Catalog')) return PDFCatalog.fromMapWithContext(...);
+//   else if (Type === PDFName.of('Pages')) return PDFPageTree.fromMapWithContext(...);
+//   else if (Type === PDFName.of('Page'))  return PDFPageLeaf.fromMapWithContext(...);
+//   else                                   return PDFDict.fromMapWithContext(...);
+//
+// That's 4 PDFName.of calls per dict, even on the overwhelming
+// majority (resource dicts, font descriptors, content-stream dicts)
+// that have no /Type entry at all. With --fast-decode-name in
+// effect each call collapses to a Map.get on fastCache, but
+// fastOf is still the #4 row in process.cpuprofile (~80 ms,
+// 5.2 %).
+//
+// PDFName instances are pool-deduped
+// ([PDFName.js:18,100](node_modules/pdf-lib/cjs/core/objects/PDFName.js:18))
+// so the sentinel "Type" / "Catalog" / "Pages" / "Page" PDFNames
+// are reference-stable for the entire load. Capture them once at
+// shim-load time and substitute direct constants for the four
+// PDFName.of calls inside parseDict. The rest of the function
+// body is preserved verbatim -- same loop, same dict.set, same
+// dispatch shape.
+//
+// Mechanism: PDFObjectParser isn't re-exported by pdf-lib's index,
+// so we reach in through the CJS internals via createRequire (same
+// shape as fast-parse-number.mjs / fast-dict-iter.mjs). Mutating
+// PDFObjectParser.prototype.parseDict is global -- every parser
+// instance created after this shim loads picks it up.
+//
+// Side-effecting import. Import once before PDFDocument.load runs:
+//
+//   import "./lib/fast-parse-dict.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const PDFName         = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFDict         = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const PDFCatalog      = require('pdf-lib/cjs/core/structures/PDFCatalog.js').default;
+const PDFPageTree     = require('pdf-lib/cjs/core/structures/PDFPageTree.js').default;
+const PDFPageLeaf     = require('pdf-lib/cjs/core/structures/PDFPageLeaf.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+
+// Capture canonical PDFName instances. Pool-dedup guarantees the
+// parser would have built === these even if the original parseDict
+// were still in play.
+const TypeName    = PDFName.of('Type');
+const CatalogName = PDFName.of('Catalog');
+const PagesName   = PDFName.of('Pages');
+const PageName    = PDFName.of('Page');
+
+if (!PDFObjectParser.prototype.__fastParseDictInstalled) {
+  // Upstream token walk; Type dispatch uses the pre-captured PDFName constants.
+  PDFObjectParser.prototype.parseDict = function fastParseDict() {
+    const stream = this.bytes;
+    const { LessThan, GreaterThan } = CharCodes;
+    stream.assertNext(LessThan);
+    stream.assertNext(LessThan);
+    this.skipWhitespaceAndComments();
+    const entries = new Map();
+    for (;;) {
+      if (stream.done() || stream.peek() === GreaterThan || stream.peekAhead(1) === GreaterThan) break;
+      const key = this.parseName();
+      entries.set(key, this.parseObject());
+      this.skipWhitespaceAndComments();
+    }
+    this.skipWhitespaceAndComments();
+    stream.assertNext(GreaterThan);
+    stream.assertNext(GreaterThan);
+    switch (entries.get(TypeName)) {
+      case CatalogName: return PDFCatalog.fromMapWithContext(entries, this.context);
+      case PagesName:   return PDFPageTree.fromMapWithContext(entries, this.context);
+      case PageName:    return PDFPageLeaf.fromMapWithContext(entries, this.context);
+      default:          return PDFDict.fromMapWithContext(entries, this.context);
+    }
+  };
+  PDFObjectParser.prototype.__fastParseDictInstalled = true;
+}
diff --git a/docs/lib/fast-parse-name.mjs b/docs/lib/fast-parse-name.mjs
new file mode 100644
index 0000000..5da62fa
--- /dev/null
+++ b/docs/lib/fast-parse-name.mjs
@@ -0,0 +1,146 @@
+// Byte-keyed cache in front of parseName: on cache hit (99.7 % of
+// calls on the book) return the existing PDFName without allocating
+// the lookup string at all.
+//
+// Step 1 of this optimisation (commit history shows the failed
+// attempt) hand-inlined parseName's byte loop to skip the
+// `this.bytes.peek() / .next() / .done()` per-byte method dispatch
+// while keeping the original cons-string accumulator. CPU didn't move:
+// V8 was already optimising the cons-string path well, and the saved
+// method-call cost just shifted attribution to the callers
+// (fastParseDictOneBuf / fastParseObject). Heap was flat too.
+//
+// This shim attacks the actual transient cost: each call builds a
+// throwaway string (cons-chain of ~8 chars on average, then flattened
+// on first use) only to hand it to PDFName.of, which hashes the string
+// against a Map<string, PDFName> and returns the cached instance.
+// 1.68 M calls × ~10-byte average × cons-string allocations + Map.get
+// hashing-the-string-again adds up to non-trivial heap throughput and
+// CPU even though the per-call work is small.
+//
+// PDF names are 4 787 unique on the book vs 1 681 225 calls -- 99.7 %
+// hit rate. So 99.7 % of those string allocations + Map hashings are
+// pure overhead: the answer was already computed, we just needed a
+// way to find it without rebuilding the key.
+//
+// The byte-cache. Keyed by a 32-bit hash of the name's raw bytes
+// (Java-style `hash * 31 + byte`), valued by the cached PDFName.
+// Each bucket stores `Entry` (single-entry, the common case for ~99 %
+// of buckets) or `Entry[]` (collision, vanishingly rare for the 4.8 k
+// unique names hashed into 2^32 space). Entry holds the bytes-key
+// (a small Uint8Array copy of the name body) for collision-check
+// equality.
+//
+// Cold path. On byte-cache miss, build the string via
+// `String.fromCharCode` (one allocation, not the per-byte cons chain
+// because we already have the full byte range from the scan) and
+// call the upstream `PDFName.of` -- which on this stack means
+// fast-decode-name's string-keyed cache, which returns the PDFName
+// (cache hit on the string side) or constructs it. Either way, the
+// PDFName instance gets cached in the byte-cache for next time.
+// Both caches converge on the same PDFName instance per logical name.
+//
+// Composes with fast-decode-name (their caches see different keys for
+// the same logical name; both return the same PDFName via this fall-
+// back chain). Direct `PDFName.of(...)` calls from non-parser code
+// (setOutline, setMetadata) bypass the byte-cache and go straight
+// through fast-decode-name -- correct, since those calls don't have
+// a byte range to work with.
+//
+// Side-effecting import. Import once before PDFDocument.load runs;
+// idempotent.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const PDFName         = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+const { IsWhitespace } = require('pdf-lib/cjs/core/syntax/Whitespace.js');
+const { IsDelimiter }  = require('pdf-lib/cjs/core/syntax/Delimiters.js');
+
+const FORWARD_SLASH = CharCodes.ForwardSlash;
+
+// hash -> Entry | Entry[]. Single-entry buckets store the Entry
+// directly; on collision we promote to an array. Entry shape is fixed
+// (bytes + name) so V8 gives it a stable hidden class.
+const byteCache = new Map();
+
+class Entry {
+  // bytes = stored name-body bytes (equality key); name = its PDFName.
+  constructor(keyBytes, pdfName) {
+    this.bytes = keyBytes; this.name = pdfName;
+  }
+}
+
+function _bytesEqual(a, buf, start, end) {
+  // Cheap length gate first, then a byte-for-byte walk of buf[start..end).
+  const span = end - start;
+  if (span !== a.length) return false;
+  for (let i = 0, j = start; i < span; i++, j++) if (a[i] !== buf[j]) return false;
+  return true;
+}
+
+if (!PDFObjectParser.prototype.__fastParseNameInstalled) {
+  const orig = PDFObjectParser.prototype.parseName;
+
+  // Byte-cached parseName: scans the body in the backing Uint8Array while
+  // hashing it and returns the cached PDFName on a hit. Defers to upstream,
+  // cursor untouched, when the '/' is missing or the body is oversized.
+  PDFObjectParser.prototype.parseName = function fastParseName() {
+    const stream = this.bytes;
+    const buf = stream.bytes;
+    const len = stream.length;
+    let idx = stream.idx;
+
+    // assertNext(ForwardSlash). Fall back on the unexpected path.
+    if (idx >= len || buf[idx] !== FORWARD_SLASH) return orig.call(this);
+    idx++;
+
+    // Scan body + compute hash in one pass. Java-style hashCode
+    // (`hash * 31 + byte`) -- monomorphic Smi math, no allocations.
+    const start = idx;
+    let hash = 0;
+    while (idx < len) {
+      const byte = buf[idx];
+      if (IsWhitespace[byte] || IsDelimiter[byte]) break;
+      hash = (hash * 31 + byte) | 0;
+      idx++;
+    }
+
+    // Oversized body (damaged input; the spec caps names at 127 bytes):
+    // fromCharCode.apply below would hit the argument limit and throw
+    // RangeError. stream.idx is still at the '/', so upstream re-parses it.
+    if (idx - start > 4096) return orig.call(this);
+    stream.idx = idx;
+
+    // Look up the byte-cache: a bucket is one Entry or an Entry[].
+    const bucket = byteCache.get(hash);
+    if (bucket !== undefined) {
+      if (bucket instanceof Entry) {
+        if (_bytesEqual(bucket.bytes, buf, start, idx)) return bucket.name;
+      } else {
+        for (let i = 0; i < bucket.length; i++) {
+          const e = bucket[i];
+          if (_bytesEqual(e.bytes, buf, start, idx)) return e.name;
+        }
+      }
+    }
+
+    // Miss. Build the lookup string in one shot, resolve it through
+    // PDFName.of, and file the result under a private copy of the bytes.
+    const slice = buf.subarray(start, idx);
+    const name = PDFName.of(String.fromCharCode.apply(null, slice));
+    const entry = new Entry(new Uint8Array(slice), name);
+    if (bucket === undefined) {
+      byteCache.set(hash, entry);
+    } else if (bucket instanceof Entry) {
+      byteCache.set(hash, [bucket, entry]);
+    } else {
+      bucket.push(entry);
+    }
+    return name;
+  };
+
+  PDFObjectParser.prototype.__fastParseNameInstalled = true;
+}
diff --git a/docs/lib/fast-parse-number.mjs b/docs/lib/fast-parse-number.mjs
new file mode 100644
index 0000000..0f202d0
--- /dev/null
+++ b/docs/lib/fast-parse-number.mjs
@@ -0,0 +1,151 @@
+// Replace pdf-lib's BaseParser.parseRawNumber and BaseParser.parseRawInt
+// with direct-integer accumulators that skip per-byte string
+// concatenation, charFromCode calls, and the trailing Number()
+// string-parse round-trip.
+//
+// The upstream implementations
+// ([BaseParser.js:17 + :33](node_modules/pdf-lib/cjs/core/parser/BaseParser.js:17))
+// build `value` one character at a time via `value += charFromCode(byte)`,
+// then call `Number(value)` to convert the string back to a number,
+// then perform `isFinite` (and for parseRawNumber, MAX_SAFE_INTEGER)
+// guards on every call. Every numeric token in a PDF flows through
+// these paths: parseRawNumber via PDFObjectParser.parseNumberOrRef
+// (once per number, twice per indirect ref), parseRawInt via
+// PDFParser.parseIndirectObjectHeader (twice per indirect object) and
+// PDFObjectStreamParser (twice per object inside an ObjStm). On the
+// book this fires hundreds of thousands of times and allocates a
+// throwaway string per call.
+//
+// The fast path accumulates the integer directly (n = n*10 + (byte -
+// 0x30)). parseRawNumber additionally descends into decimal handling
+// when a period appears. Both fall back to the original for:
+//   - Numbers with > 15 integer digits (where direct accumulation
+//     could exceed Number.MAX_SAFE_INTEGER and lose precision).
+//   - Empty-digit cases (e.g., bare sign or lone "."), so upstream's
+//     NumberParsingError keeps its diagnostic context.
+// Both fallback paths are vanishingly rare on real PDFs.
+//
+// Mechanism: BaseParser isn't re-exported by pdf-lib's index, so we
+// import it via the package's CJS internal path through createRequire.
+// Mutating BaseParser.prototype affects every subclass (PDFParser,
+// PDFObjectParser, PDFObjectStreamParser, PDFXRefStreamParser).
+//
+// Side-effecting import. Import once before PDFDocument.load runs:
+//
+//   import "./lib/fast-parse-number.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const BaseParser = require('pdf-lib/cjs/core/parser/BaseParser.js').default;
+const { IsDigit } = require('pdf-lib/cjs/core/syntax/Numeric.js');
+
+const ZERO = 0x30;   // '0'
+const PERIOD = 0x2E; // '.'
+const PLUS = 0x2B;   // '+'
+const MINUS = 0x2D;  // '-'
+
+// Number.MAX_SAFE_INTEGER == 9007199254740991 (16 digits). 15-digit
+// integers are guaranteed to accumulate exactly without precision loss.
+const MAX_SAFE_INT_DIGITS = 15;
+
+if (!BaseParser.__fastParseNumberInstalled) {
+  const origParseRawNumber = BaseParser.prototype.parseRawNumber;
+  const origParseRawInt = BaseParser.prototype.parseRawInt;
+
+  // Digit-run reader: folds ASCII digits straight into a Number. Rewinds and
+  // defers to upstream for an empty run (so the parsing error is raised
+  // there) and for runs longer than MAX_SAFE_INT_DIGITS.
+  BaseParser.prototype.parseRawInt = function fastParseRawInt() {
+    const stream = this.bytes;
+    const origin = stream.offset();
+
+    let total = 0;
+    let count = 0;
+    let tooLong = false;
+    for (let ch = stream.peek(); !stream.done() && IsDigit[ch]; ch = stream.peek()) {
+      if (count >= MAX_SAFE_INT_DIGITS) { tooLong = true; break; }
+      total = total * 10 + (ch - ZERO);
+      count++;
+      stream.next();
+    }
+    // Either bail-out restarts upstream from the token's first byte.
+    if (tooLong || count === 0) {
+      stream.moveTo(origin);
+      return origParseRawInt.call(this);
+    }
+    return total;
+  };
+
+  // Sign / integer / optional-fraction reader. Digits fold into one integer
+  // mantissa and a power-of-ten scale; a single division of those two exact
+  // integers is correctly rounded, i.e. equals upstream's Number(text). More
+  // than MAX_SAFE_INT_DIGITS digits in total (or none) defers to upstream.
+  BaseParser.prototype.parseRawNumber = function fastParseRawNumber() {
+    const bytes = this.bytes;
+    const start = bytes.offset();
+
+    // Sign
+    let byte = bytes.peek();
+    let neg = false;
+    if (byte === PLUS || byte === MINUS) {
+      neg = byte === MINUS;
+      bytes.next();
+      byte = bytes.peek();
+    }
+
+    // Integer part
+    let mantissa = 0;
+    let digits = 0;
+    while (!bytes.done() && IsDigit[byte]) {
+      if (digits >= MAX_SAFE_INT_DIGITS) {
+        // Precision risk -- rewind and delegate to upstream's Number() path.
+        bytes.moveTo(start);
+        return origParseRawNumber.call(this);
+      }
+      mantissa = mantissa * 10 + (byte - ZERO);
+      digits++;
+      bytes.next();
+      byte = bytes.peek();
+    }
+
+    if (byte !== PERIOD) {
+      if (digits === 0) {
+        // Bare sign with no digits: let upstream throw with full context.
+        bytes.moveTo(start);
+        return origParseRawNumber.call(this);
+      }
+      return neg ? -mantissa : mantissa;
+    }
+
+    // Consume period
+    bytes.next();
+    byte = bytes.peek();
+
+    // Fraction digits extend the same mantissa; `scale` tracks 10^k. Past the
+    // digit budget either could round (or overflow), so upstream takes over.
+    let scale = 1;
+    while (!bytes.done() && IsDigit[byte]) {
+      if (digits >= MAX_SAFE_INT_DIGITS) {
+        bytes.moveTo(start);
+        return origParseRawNumber.call(this);
+      }
+      mantissa = mantissa * 10 + (byte - ZERO);
+      scale *= 10;
+      digits++;
+      bytes.next();
+      byte = bytes.peek();
+    }
+    if (digits === 0) {
+      // Lone "." with no digits on either side: upstream throws.
+      bytes.moveTo(start);
+      return origParseRawNumber.call(this);
+    }
+    const value = mantissa / scale;
+    return neg ? -value : value;
+  };
+
+  BaseParser.__fastParseNumberInstalled = true;
+}
diff --git a/docs/lib/fast-parse-object.mjs b/docs/lib/fast-parse-object.mjs
new file mode 100644
index 0000000..e573dc4
--- /dev/null
+++ b/docs/lib/fast-parse-object.mjs
@@ -0,0 +1,92 @@
+// Dispatch PDFObjectParser.parseObject by first byte; gate the three
+// keyword scans behind a byte check.
+//
+// The upstream parseObject
+// ([PDFObjectParser.js:36](node_modules/pdf-lib/cjs/core/parser/PDFObjectParser.js:36))
+// runs three speculative matchKeyword calls (true / false / null)
+// before peeking the dispatch byte:
+//
+//   parseObject() {
+//     this.skipWhitespaceAndComments();
+//     if (this.matchKeyword(Keywords.true))  return PDFBool.True;
+//     if (this.matchKeyword(Keywords.false)) return PDFBool.False;
+//     if (this.matchKeyword(Keywords.null))  return PDFNull;
+//     var byte = this.bytes.peek();
+//     ...
+//   }
+//
+// parseObject is called for every dict value, array element, and
+// indirect-object body -- same call density as fastParseDict, which
+// is the #2 row in the process profile. true / false / null are
+// extraordinarily rare in real PDFs (boolean / null entries on
+// individual dict values, mostly), so the three matchKeyword calls
+// fail-and-rewind on essentially every invocation. Each failure
+// still pays bytes.offset() + bytes.next() + comparison +
+// bytes.moveTo(initialOffset).
+//
+// This shim flips the dispatch: peek the first byte, branch by byte
+// for the structural tokens, and only enter matchKeyword when the
+// byte is `t` / `f` / `n` (i.e. could plausibly start the keyword).
+// Dispatch order is by observed frequency in dict-value position:
+// numbers / refs first (digits + sign + period), then dicts (<<),
+// names (/), arrays ([), strings ((), hex strings (<). Same
+// semantics -- a value starting with `t`/`f`/`n` that isn't a
+// keyword still falls through to the same PDFObjectParsingError
+// throw.
+//
+// Mechanism: PDFObjectParser isn't re-exported from pdf-lib's index,
+// so we reach in through the CJS internals via createRequire (same
+// shape as fast-parse-dict.mjs). Mutating
+// PDFObjectParser.prototype.parseObject is global -- every parser
+// instance created after this shim loads picks it up.
+//
+// Side-effecting import. Import once before PDFDocument.load runs:
+//
+//   import "./lib/fast-parse-object.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const PDFBool         = require('pdf-lib/cjs/core/objects/PDFBool.js').default;
+const PDFNull         = require('pdf-lib/cjs/core/objects/PDFNull.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+const { Keywords }    = require('pdf-lib/cjs/core/syntax/Keywords.js');
+const { IsNumeric }   = require('pdf-lib/cjs/core/syntax/Numeric.js');
+const { PDFObjectParsingError } = require('pdf-lib/cjs/core/errors.js');
+
+const KwTrue  = Keywords.true;
+const KwFalse = Keywords.false;
+const KwNull  = Keywords.null;
+
+const LessThan          = CharCodes.LessThan;
+const ForwardSlash      = CharCodes.ForwardSlash;
+const LeftSquareBracket = CharCodes.LeftSquareBracket;
+const LeftParen         = CharCodes.LeftParen;
+const t_code            = CharCodes.t;
+const f_code            = CharCodes.f;
+const n_code            = CharCodes.n;
+
+if (!PDFObjectParser.prototype.__fastParseObjectInstalled) {
+  // First-byte dispatch; keyword matching is attempted only on t / f / n.
+  PDFObjectParser.prototype.parseObject = function fastParseObject() {
+    this.skipWhitespaceAndComments();
+    const stream = this.bytes;
+    const first = stream.peek();
+    if (IsNumeric[first]) return this.parseNumberOrRef();
+    switch (first) {
+      case LessThan:
+        return stream.peekAhead(1) === LessThan ? this.parseDictOrStream() : this.parseHexString();
+      case ForwardSlash:      return this.parseName();
+      case LeftSquareBracket: return this.parseArray();
+      case LeftParen:         return this.parseString();
+      case t_code: if (this.matchKeyword(KwTrue))  return PDFBool.True;  break;
+      case f_code: if (this.matchKeyword(KwFalse)) return PDFBool.False; break;
+      case n_code: if (this.matchKeyword(KwNull))  return PDFNull;       break;
+    }
+    throw new PDFObjectParsingError(stream.position(), first);
+  };
+  PDFObjectParser.prototype.__fastParseObjectInstalled = true;
+}
diff --git a/docs/lib/fast-pdfnumber-pool.mjs b/docs/lib/fast-pdfnumber-pool.mjs
new file mode 100644
index 0000000..b0ee999
--- /dev/null
+++ b/docs/lib/fast-pdfnumber-pool.mjs
@@ -0,0 +1,61 @@
+// Pool PDFNumber instances by value.
+//
+// After fast-refs / fast-indirect-objects / fast-dict-array shipped,
+// the residual heap profile attributed ~15 MB of self-size to
+// PDFObjectParser.parseNumberOrRef -- mostly inlined `new
+// PDFNumber(value)` calls (each of which also allocates a fresh
+// stringValue via `numberToString(value)`):
+//
+//     function PDFNumber(value) {
+//         var _this = _super.call(this) || this;
+//         _this.numberValue = value;
+//         _this.stringValue = numberToString(value);   // allocs
+//         return _this;
+//     }
+//     PDFNumber.of = function (value) { return new PDFNumber(value); };
+//
+// No pool. Every PDFNumber.of(N) returns a fresh instance, even
+// though PDFs are packed with repeated numeric values: page indices
+// 0..1651, /Count totals, /N object-stream lengths, common
+// /MediaBox dimensions (612, 792, 595, 842), font sizes, bit
+// widths. The book parses hundreds of thousands of PDFNumber.of
+// calls against a few thousand unique values.
+//
+// Shim. Dense array indexed by `value` for non-negative small
+// integers (0..POOL_SIZE-1, currently 16384 -- covers all observed
+// integer values in the book by a wide margin). Map fallback for
+// floats, negatives, and out-of-range integers. Same shape as
+// fast-refs on the PDFRef side. PDFNumber is immutable
+// (numberValue and stringValue are set in the constructor and never
+// mutated), so sharing instances is safe.
+//
+// Side-effecting import. Import once before any pdf-lib operation.
+// Idempotent.
+
+import { PDFNumber } from "pdf-lib";
+
+const POOL_SIZE = 16384;
+
+if (!PDFNumber.__fastPoolInstalled) {
+  const construct = PDFNumber.of;
+  const smallInts = new Array(POOL_SIZE);  // holey until slots are filled
+  const others = new Map();                // floats / negatives / large ints
+
+  // Interning factory: one shared PDFNumber per distinct numeric value.
+  PDFNumber.of = function fastNumberOf(value) {
+    const poolable = value >= 0 && value < POOL_SIZE && (value | 0) === value;
+    if (!poolable) {
+      // Map keys use SameValueZero, so a NaN key is found again on the
+      // next lookup instead of allocating a fresh instance each time.
+      const cached = others.get(value);
+      if (cached !== undefined) return cached;
+      const made = construct.call(PDFNumber, value);
+      others.set(value, made);
+      return made;
+    }
+    const hit = smallInts[value];
+    if (hit !== undefined) return hit;
+    return (smallInts[value] = construct.call(PDFNumber, value));
+  };
+  PDFNumber.__fastPoolInstalled = true;
+}
diff --git a/docs/lib/fast-refs-class.mjs b/docs/lib/fast-refs-class.mjs
new file mode 100644
index 0000000..c1c11e2
--- /dev/null
+++ b/docs/lib/fast-refs-class.mjs
@@ -0,0 +1,130 @@
+// fast-refs variant: use a class-style constructor for stable hidden class.
+//
+// fast-refs.mjs builds PDFRef instances with
+// `Object.create(PDFRef.prototype) + fresh.objectNumber = ... + fresh.gen = ...`.
+// V8 treats objects built that way as transitioning through intermediate
+// hidden-class maps as each property is added, and the result is roughly
+// twice as large per instance as a `new`-built object with the same
+// fields. Empirically on the book, PDFRef sits at ~60 B/instance via
+// fast-refs whereas PDFName (built via `new PDFName(...)`) sits at ~31 B.
+//
+// This shim swaps the `Object.create + writes` pattern for a constructor
+// that sets both fields in one shot, giving V8 a stable hidden class
+// from the first instance.
+//
+// Two-shape variant: most PDFRefs on fresh-Chrome workloads are gen=0
+// and don't need to carry generationNumber at all. We allocate them via
+// _FastRef (single `objectNumber` inline slot) and let the prototype
+// supply a default `generationNumber = 0`. The rare gen!=0 path (PDF
+// spec allows it; our workload only hits it for the xref "free" entry
+// at object 0) uses _FastRefGen with both fields as own data properties.
+// V8 sees a bounded 2-shape polymorphism on PDFRef.prototype, and the
+// monomorphic hot path (gen=0 instances) keeps inline-field-read speed
+// for `.objectNumber` and `.generationNumber` reads -- no accessor-
+// property boundary to break inlining at upstream pdf-lib call sites
+// (PDFCrossRefSection.append, PDFCrossRefStream entry tuples,
+// PDFWriter.serializeToBuffer, our fast-indirect-objects shim, ...).
+//
+// Expected per-gen=0 instance: header (8 B) + 1 inline slot (4 B) = 12 B
+// raw, aligned to 16 B by V8 -- versus 12 + 2*4 = 20 B raw, aligned to
+// 24 B for a 2-slot instance. Saves 8 B per gen=0 PDFRef * ~226 k unique
+// = ~1.8 MB heap on the book.
+//
+// Mutually exclusive with --fast-refs in the harness.
+
+import { PDFRef } from 'pdf-lib';
+
+// ---- helpers (same as fast-refs.mjs, see commentary there) -------------
+
+function _writeUint(buffer, offset, n) {
+  // Writes n as ASCII decimal at buffer[offset..]; returns bytes written.
+  if (n < 10) { buffer[offset] = 0x30 + n; return 1; }
+  let width = 0;
+  for (let rest = n; rest > 0; rest = (rest / 10) | 0) width++;
+  for (let pos = offset + width - 1, rest = n; pos >= offset; pos--, rest = (rest / 10) | 0) {
+    buffer[pos] = 0x30 + (rest % 10);
+  }
+  return width;
+}
+
+function _digitCount(n) {
+  // Decimal digit count of n. Values below one million (the common case)
+  // are answered by comparisons alone; larger ones by repeated division.
+  if (n < 1000) return n < 10 ? 1 : n < 100 ? 2 : 3;
+  if (n < 1000000) return n < 10000 ? 4 : n < 100000 ? 5 : 6;
+  let width = 0;
+  for (let rest = n; rest > 0; rest = (rest / 10) | 0) {
+    width++;
+  }
+  return width;
+}
+
+// ---- the constructor-based fast PDFRef shapes --------------------------
+
+// gen=0 shape: the only own property is `objectNumber`. A read of
+// `generationNumber` resolves through PDFRef.prototype, where the install
+// block below stores 0. Sharing that prototype keeps `instanceof PDFRef` true.
+function _FastRef(objectNumber) {
+  this.objectNumber = objectNumber;
+}
+_FastRef.prototype = PDFRef.prototype;
+
+// gen!=0 shape: both numbers are own data properties, so the instance's
+// `generationNumber` shadows the prototype-level default of 0. Uses
+// PDFRef.prototype as its prototype so instances inherit PDFRef's methods.
+function _FastRefGen(objectNumber, generationNumber) {
+  this.objectNumber = objectNumber;
+  this.generationNumber = generationNumber;
+}
+_FastRefGen.prototype = PDFRef.prototype;
+
+if (!PDFRef.__fastRefsClassInstalled) {
+  const zeroGenRefs = [];          // gen=0 refs, slot index = objectNumber
+  const otherGenRefs = new Map();  // gen!=0 refs, keyed "<obj> <gen>"
+
+  // Interning factory. An omitted generationNumber is treated as 0.
+  PDFRef.of = function fastClassOf(objectNumber, generationNumber) {
+    if (generationNumber !== undefined && generationNumber !== 0) {
+      // Rare branch (non-zero generation), kept for spec correctness.
+      const key = objectNumber + ' ' + generationNumber;
+      let genRef = otherGenRefs.get(key);
+      if (!genRef) {
+        genRef = new _FastRefGen(objectNumber, generationNumber);
+        otherGenRefs.set(key, genRef);
+      }
+      return genRef;
+    }
+    let ref = zeroGenRefs[objectNumber];
+    if (!ref) {
+      ref = new _FastRef(objectNumber);
+      zeroGenRefs[objectNumber] = ref;
+    }
+    return ref;
+  };
+
+  // Prototype-level default read by _FastRef instances, which carry no
+  // own generationNumber; _FastRefGen instances shadow it.
+  PDFRef.prototype.generationNumber = 0;
+
+  // Serialisation methods rebuilt on the two numeric fields, since
+  // instances made by this factory do not carry upstream's `tag` string.
+  PDFRef.prototype.toString = function () {
+    return this.objectNumber + ' ' + this.generationNumber + ' R';
+  };
+
+  PDFRef.prototype.sizeInBytes = function () {
+    return _digitCount(this.objectNumber) + _digitCount(this.generationNumber) + 3;
+  };
+
+  PDFRef.prototype.copyBytesInto = function (buffer, offset) {
+    let cursor = offset;
+    cursor += _writeUint(buffer, cursor, this.objectNumber);
+    buffer[cursor++] = 0x20;  // ' '
+    cursor += _writeUint(buffer, cursor, this.generationNumber);
+    buffer[cursor++] = 0x20;  // ' '
+    buffer[cursor++] = 0x52;  // 'R'
+    return cursor - offset;
+  };
+
+  PDFRef.__fastRefsClassInstalled = true;
+}
diff --git a/docs/lib/fast-refs.mjs b/docs/lib/fast-refs.mjs
new file mode 100644
index 0000000..beeb76a
--- /dev/null
+++ b/docs/lib/fast-refs.mjs
@@ -0,0 +1,140 @@
+// Replace pdf-lib's PDFRef.of pool lookup with a dense-array cache
+// for the generation=0 case (the overwhelmingly common one), AND
+// drop the per-instance `tag` string entirely.
+//
+// The upstream implementation
+// (node_modules/pdf-lib/cjs/core/objects/PDFRef.js) keys its pool by
+// a freshly-built string `<obj> <gen> R` on every call:
+//
+//   var tag = objectNumber + " " + generationNumber + " R";
+//   var instance = pool.get(tag);
+//
+// On the book we see ~1.2 M PDFRef.of calls per load, 82 % of them
+// with gen=0; each call allocates the tag string before Map.get can
+// hash it. That's ~330 ms of self-time on the process-phase profile
+// plus measurable GC pressure.
+//
+// Shim part 1: dense array indexed by objectNumber for the gen=0 branch.
+// Plain array indexing, no string alloc, no Map hash. On a gen=0 cache
+// miss we construct the PDFRef directly via
+// `Object.create(PDFRef.prototype)` plus manual field init, skipping
+// both the ENFORCER check and the upstream `pool.set(tag, instance)`.
+//
+// Shim part 2: drop the per-instance `tag` field. Upstream caches
+// `<obj> <gen> R` on each PDFRef so toString / sizeInBytes /
+// copyBytesInto can read it back. After fast-array-onebuf shipped,
+// the heap profile showed PDFParser.parseIndirectObjectHeader sitting
+// at 13.7 MB (25 % of total). The attribution chain (via
+// perf/find-heap-callers.mjs):
+//
+//   parseIndirectObjectHeader  → skipJibberish (14.2 MB)
+//     → matchIndirectObjectHeader (try/catch wrapper)
+//       → parseIndirectObjectHeader → fastOf
+//
+// skipJibberish runs after every successful indirect object parse and
+// speculatively calls matchIndirectObjectHeader to detect the next
+// `N M obj` header. On valid PDFs the speculation always succeeds, so
+// fastOf fires once per indirect-object boundary, populating the
+// dense-array cache. The subsequent "real" parseIndirectObject then
+// hits the cache. V8 inlines fastOf at this call site (small + hot
+// from speculation) so the attribution lands on the caller -- 13.7 MB
+// of which was the tag-string allocation (`objectNumber + ' 0 R'`):
+// V8 builds 1-2 intermediate concat strings + the final ~25-35 B
+// tag, ~150 k times.
+//
+// Eliminating the `tag` field collapses all of that. The prototype
+// methods now compute their results from objectNumber / generationNumber
+// directly. copyBytesInto writes digits straight into the output buffer
+// with a no-allocation _writeUint helper; sizeInBytes returns
+// digitCount(obj) + digitCount(gen) + 3 (for " " + " R"); toString
+// builds on demand (only used for debug, no caching needed).
+//
+// gen != 0 PDFRefs constructed via the upstream path still have `tag`
+// set by the upstream constructor -- our overrides ignore the field,
+// so the tag string is allocated-then-wasted. gen != 0 is ~18 % of refs
+// at ~50 K instances; the waste is bounded and not worth patching the
+// constructor for.
+//
+// gen != 0 cache lookups (pdf-lib's xref-stream bookkeeping where
+// "generation" encodes an in-ObjStm index per PDF 1.5 spec, see
+// PDFXRefStreamParser.js:74-80) still pass through the original
+// PDFRef.of -- their Map pool is harmless at gen!=0's volume.
+//
+// Side-effecting import. Import once before any pdf-lib operation.
+// Idempotent.
+
+import { PDFRef } from "pdf-lib";
+
+// Write n's decimal representation into buffer starting at offset.
+// No allocations. Returns the number of bytes written. n must be a
+// non-negative integer.
+function _writeUint(buffer, offset, n) {
+  if (n < 10) { buffer[offset] = 0x30 + n; return 1; }
+  // Count digits.
+  let m = n, d = 0;
+  while (m > 0) { d++; m = (m / 10) | 0; }
+  // Write digits backwards.
+  for (let i = d - 1; i >= 0; i--) {
+    buffer[offset + i] = 0x30 + (n % 10);
+    n = (n / 10) | 0;
+  }
+  return d;
+}
+
+// Non-allocating decimal digit count for non-negative integers.
+// Ladder catches the common small-number cases without arithmetic.
+function _digitCount(n) {
+  if (n < 10)      return 1;
+  if (n < 100)     return 2;
+  if (n < 1000)    return 3;
+  if (n < 10000)   return 4;
+  if (n < 100000)  return 5;
+  if (n < 1000000) return 6;
+  let d = 0;
+  while (n > 0) { d++; n = (n / 10) | 0; }
+  return d;
+}
+
+if (!PDFRef.__fastPoolInstalled) {  // install once; the flag is set at the end of this block
+  const original = PDFRef.of;  // upstream factory, still used for generation != 0
+  const pool0 = [];  // gen-0 cache indexed by objectNumber; starts empty, so refs pooled upstream earlier are not reused
+  PDFRef.of = function fastOf(objectNumber, generationNumber) {
+    if (generationNumber === undefined || generationNumber === 0) {  // generation omitted or 0
+      const existing = pool0[objectNumber];
+      if (existing) return existing;
+      // Direct construction -- skip ENFORCER check, skip upstream pool.set,
+      // skip the per-instance `tag` string (the prototype methods now
+      // compute their results from objectNumber / generationNumber).
+      const fresh = Object.create(PDFRef.prototype);
+      fresh.objectNumber = objectNumber;
+      fresh.generationNumber = 0;
+      pool0[objectNumber] = fresh;  // cache so later calls return this instance
+      return fresh;
+    }
+    return original.call(PDFRef, objectNumber, generationNumber);  // gen != 0: upstream Map pool
+  };
+
+  // Replace the upstream prototype methods to ignore `tag` entirely.
+  // Works for both gen=0 (tag is absent) and gen!=0 (tag is set by
+  // upstream's constructor but ignored).
+
+  PDFRef.prototype.toString = function () {  // renders "<obj> <gen> R", rebuilt on every call
+    return this.objectNumber + ' ' + this.generationNumber + ' R';
+  };
+
+  PDFRef.prototype.sizeInBytes = function () {  // byte count that copyBytesInto writes
+    return _digitCount(this.objectNumber) + _digitCount(this.generationNumber) + 3;  // +3: two spaces and 'R'
+  };
+
+  PDFRef.prototype.copyBytesInto = function (buffer, offset) {  // writes "<obj> <gen> R" into buffer at offset
+    const start = offset;
+    offset += _writeUint(buffer, offset, this.objectNumber);
+    buffer[offset++] = 0x20;  // ' '
+    offset += _writeUint(buffer, offset, this.generationNumber);
+    buffer[offset++] = 0x20;  // ' '
+    buffer[offset++] = 0x52;  // 'R'
+    return offset - start;  // number of bytes written
+  };
+
+  PDFRef.__fastPoolInstalled = true;
+}
diff --git a/docs/lib/fast-size-in-bytes.mjs b/docs/lib/fast-size-in-bytes.mjs
new file mode 100644
index 0000000..779ade4
--- /dev/null
+++ b/docs/lib/fast-size-in-bytes.mjs
@@ -0,0 +1,62 @@
+// Replace pdf-lib's utils.sizeInBytes -- which allocates a base-2 string
+// just to count its bit length -- with a non-allocating short-circuit
+// ladder.
+//
+// The upstream sizeInBytes
+// ([numbers.js:37](node_modules/pdf-lib/cjs/utils/numbers.js:37)) is:
+//
+//   exports.sizeInBytes = function (n) {
+//       return Math.ceil(n.toString(2).length / 8);
+//   };
+//
+// It's called from PDFCrossRefStream.computeMaxEntryByteWidths (three
+// calls per xref entry, ~50 k entries on the book) and from
+// utils.bytesFor (to size the Uint8Array before filling it byte-by-
+// byte, called from PDFCrossRefStream.getUnencodedContents). Both
+// paths are part of writing the cross-reference stream.
+//
+// For the xref values the distribution is heavily skewed small: type
+// is always 0/1/2 (1 byte), generationNumber is always 0 (1 byte),
+// object-stream indices are small (1-2 bytes), and file offsets are
+// 3-4 bytes for any sub-4GB PDF. A short-circuit ladder catches the
+// dominant cases in one compare; the rare 5+ byte tail falls through
+// to a Math.clz32-based fallback that's still allocation-free.
+//
+// Why patch three places (and why bytesFor isn't on the list):
+// pdf-lib ships compiled against tslib 1.x, whose `__exportStar`
+// does a value-copy (`exports[p] = m[p]`) rather than installing a
+// live getter. So consumers that read sizeInBytes through a barrel
+// (`utils_1.sizeInBytes(...)` from PDFCrossRefStream) hold a
+// captured reference and won't see a mutation of `numbers.sizeInBytes`
+// alone. Patch all three barrel layers (utils/numbers, utils/index,
+// top-level index) to cover every observed call site. utils.bytesFor
+// reads `exports.sizeInBytes` at call time from the same module
+// object we mutate first, so it picks up the fast path without a
+// separate patch.
+//
+// Side-effecting import. Import once before any pdf-lib operation:
+//
+//   import "./lib/fast-size-in-bytes.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const numbers     = require('pdf-lib/cjs/utils/numbers.js');
+const utilsBarrel = require('pdf-lib/cjs/utils/index.js');
+const topBarrel   = require('pdf-lib/cjs/index.js');
+
+if (!numbers.__fastSizeInBytesInstalled) {  // install once; the flag lives on the numbers module
+  const fastSizeInBytes = function fastSizeInBytes(n) {  // bytes needed to hold n, without building a string
+    if (n < 0x100) return 1;
+    if (n < 0x10000) return 2;
+    if (n < 0x1000000) return 3;
+    if (n < 0x100000000) return 4;
+    return 4 + Math.ceil((32 - Math.clz32(Math.floor(n / 0x100000000))) / 8);  // 4 low bytes + bytes of the high word
+  };
+  numbers.sizeInBytes     = fastSizeInBytes;  // patched first: utils.bytesFor reads this export at call time
+  utilsBarrel.sizeInBytes = fastSizeInBytes;  // barrels hold value copies, so each layer is patched
+  topBarrel.sizeInBytes   = fastSizeInBytes;
+  numbers.__fastSizeInBytesInstalled = true;
+}
diff --git a/docs/lib/fast-sync-load.mjs b/docs/lib/fast-sync-load.mjs
new file mode 100644
index 0000000..1109247
--- /dev/null
+++ b/docs/lib/fast-sync-load.mjs
@@ -0,0 +1,345 @@
+// Strip pdf-lib's parseSpeed / objectsPerTick / shouldWaitForTick /
+// waitForTick machinery entirely. Synchronify everywhere the conditional
+// yield was the only async thing in the method.
+//
+// pdf-lib's parser and writers are downlevel-compiled from TypeScript
+// `async function` to tslib's __awaiter + __generator state machine,
+// so on browsers they can yield to the event loop every
+// `objectsPerTick` objects via `await waitForTick()`. In Node with
+// objectsPerTick: Infinity (which parseSpeed: Fastest historically
+// set on the load side) the gate never fires -- the entire generator
+// runs in one tick -- yet every indirect object (~50 k on the book)
+// still pays the state-machine dispatch + Promise allocation for a
+// single fall-through `case 0`.
+//
+// Nine methods participate in this pattern; this shim replaces all
+// of them with synchronous (or, where a legitimate await remains,
+// awaiterless `async`) twins:
+//
+//   Load side (parser):
+//     PDFParser.prototype.parseDocument
+//     PDFParser.prototype.parseDocumentSection
+//     PDFParser.prototype.parseIndirectObjects
+//     PDFParser.prototype.parseIndirectObject
+//     PDFObjectStreamParser.prototype.parseIntoContext
+//     PDFDocument.load   (static; only awaited parseDocument)
+//
+//   Save side (writers):
+//     PDFWriter.prototype.serializeToBuffer
+//       (kept `async` because the inherited path awaits the
+//        ParallelStreamWriter override of computeBufferSize, which
+//        does genuine Promise.all-driven libuv-pool concurrency)
+//     PDFWriter.prototype.computeBufferSize
+//     PDFStreamWriter.prototype.computeBufferSize
+//
+// The load-side patches have to land together: each method awaits
+// the next one down, so desugaring any one in isolation still leaves
+// a Promise chain dangling.
+//
+// PDFDocument.load keeps its parameters but now returns the document
+// itself rather than a Promise. `await PDFDocument.load(bytes)` still
+// works (awaiting a non-Promise yields the value); `.then()` callers
+// do not, and parse errors throw synchronously. The parseSpeed option
+// is silently ignored. parallel-deflate.mjs's parallelSave drops
+// `objectsPerTick` from its public API in step with this shim.
+//
+// Side-effecting import. Import once before any pdf-lib operation:
+//
+//   import "./lib/fast-sync-load.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFParser              = require('pdf-lib/cjs/core/parser/PDFParser.js').default;
+const PDFObjectStreamParser  = require('pdf-lib/cjs/core/parser/PDFObjectStreamParser.js').default;
+const PDFXRefStreamParser    = require('pdf-lib/cjs/core/parser/PDFXRefStreamParser.js').default;
+const PDFRawStream           = require('pdf-lib/cjs/core/objects/PDFRawStream.js').default;
+const PDFRef                 = require('pdf-lib/cjs/core/objects/PDFRef.js').default;
+const PDFName                = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFNumber              = require('pdf-lib/cjs/core/objects/PDFNumber.js').default;
+const PDFStream              = require('pdf-lib/cjs/core/objects/PDFStream.js').default;
+const PDFInvalidObject       = require('pdf-lib/cjs/core/objects/PDFInvalidObject.js').default;
+const PDFDocument            = require('pdf-lib/cjs/api/PDFDocument.js').default;
+const PDFWriter              = require('pdf-lib/cjs/core/writers/PDFWriter.js').default;
+const PDFStreamWriter        = require('pdf-lib/cjs/core/writers/PDFStreamWriter.js').default;
+const PDFHeader              = require('pdf-lib/cjs/core/document/PDFHeader.js').default;
+const PDFTrailer             = require('pdf-lib/cjs/core/document/PDFTrailer.js').default;
+const PDFTrailerDict         = require('pdf-lib/cjs/core/document/PDFTrailerDict.js').default;
+const PDFCrossRefSection     = require('pdf-lib/cjs/core/document/PDFCrossRefSection.js').default;
+const PDFCrossRefStream      = require('pdf-lib/cjs/core/structures/PDFCrossRefStream.js').default;
+const PDFObjectStream        = require('pdf-lib/cjs/core/structures/PDFObjectStream.js').default;
+const CharCodes              = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+const { ReparseError, StalledParserError } = require('pdf-lib/cjs/core/errors.js');
+const { IsDigit }            = require('pdf-lib/cjs/core/syntax/Numeric.js');
+const { Keywords }           = require('pdf-lib/cjs/core/syntax/Keywords.js');
+const { toUint8Array, copyStringIntoBuffer, last } = require('pdf-lib/cjs/utils/index.js');
+
+// Pool-deduped PDFName / PDFRef instances are reference-stable for the
+// whole load (see fast-parse-dict.mjs for the same trick). Capture the
+// three Type-dispatch names plus the `0 0 R` ref and the /Size key.
+const TypeName   = PDFName.of('Type');
+const ObjStmName = PDFName.of('ObjStm');
+const XRefName   = PDFName.of('XRef');
+const RefZero    = PDFRef.of(0);
+const SizeName   = PDFName.of('Size');
+
+if (!PDFParser.prototype.__fastSyncLoadInstalled) {
+
+  // ----- Load side ---------------------------------------------------
+
+  PDFParser.prototype.parseDocument = function parseDocumentSync() {
+    if (this.alreadyParsed) {
+      throw new ReparseError('PDFParser', 'parseDocument');
+    }
+    this.alreadyParsed = true;
+    this.context.header = this.parseHeader();
+
+    let prevOffset;
+    while (!this.bytes.done()) {
+      this.parseDocumentSection();
+      const offset = this.bytes.offset();
+      if (offset === prevOffset) {
+        throw new StalledParserError(this.bytes.position());
+      }
+      prevOffset = offset;
+    }
+
+    this.maybeRecoverRoot();
+    if (this.context.lookup(RefZero)) {
+      console.warn('Removing parsed object: 0 0 R');
+      this.context.delete(RefZero);
+    }
+    return this.context;
+  };
+
+  PDFParser.prototype.parseDocumentSection = function parseDocumentSectionSync() {  // one pass: objects, xref, trailer dict, trailer
+    this.parseIndirectObjects();
+    this.maybeParseCrossRefSection();
+    this.maybeParseTrailerDict();
+    this.maybeParseTrailer();
+    this.skipJibberish();  // once per section, regardless of the fast path in parseIndirectObjects
+  };
+
+  PDFParser.prototype.parseIndirectObjects = function parseIndirectObjectsSync() {  // parses consecutive `N M obj` objects
+    this.skipWhitespaceAndComments();
+    while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {
+      const initialOffset = this.bytes.offset();
+      try {
+        this.parseIndirectObject();
+      } catch (e) {  // any parse failure: rewind and hand the span to the invalid-object fallback
+        this.bytes.moveTo(initialOffset);
+        this.tryToParseInvalidIndirectObject();
+      }
+      this.skipWhitespaceAndComments();
+      // Fast path: on valid PDFs the next byte is almost always a digit
+      // (start of the next `N M obj` header). skipJibberish only exists
+      // to recover from invalid PDFs that wedge garbage between indirect
+      // objects, but its hot path -- 150 k calls per load on the book --
+      // speculatively runs matchKeyword(xref/trailer/startxref) (all fail
+      // on a digit) and then matchIndirectObjectHeader (a try/catch
+      // around parseIndirectObjectHeader + parseRawInt x2 + matchKeyword
+      // + fastOf round-trip). All to confirm what the outer while's
+      // IsDigit check already knew. Short-circuit when the cursor is on
+      // a digit; fall through to skipJibberish on anything else
+      // (xref / trailer / startxref keyword starts, or real jibberish).
+      // parseDocumentSection's once-per-section skipJibberish is unaffected.
+      // NOTE(review): digit-led garbage now lands in the catch path above -- confirm.
+      if (!this.bytes.done() && IsDigit[this.bytes.peek()]) continue;
+      this.skipJibberish();
+    }
+  };
+
+  PDFParser.prototype.parseIndirectObject = function parseIndirectObjectSync() {  // parses one `N M obj ... endobj`; returns its ref
+    const ref = this.parseIndirectObjectHeader();
+    this.skipWhitespaceAndComments();
+    const object = this.parseObject();
+    this.skipWhitespaceAndComments();
+    this.matchKeyword(Keywords.endobj);
+    if (object instanceof PDFRawStream &&
+        object.dict.lookup(TypeName) === ObjStmName) {  // identity compare against the captured PDFName
+      PDFObjectStreamParser.forStream(object).parseIntoContext();  // stream itself is not assigned to ref
+    } else if (object instanceof PDFRawStream &&
+               object.dict.lookup(TypeName) === XRefName) {
+      PDFXRefStreamParser.forStream(object).parseIntoContext();  // stream itself is not assigned to ref
+    } else {
+      this.context.assign(ref, object);  // every other object is stored under its ref
+    }
+    return ref;
+  };
+
+  PDFObjectStreamParser.prototype.parseIntoContext = function parseIntoContextSync() {  // sync twin: assigns each contained object
+    if (this.alreadyParsed) {
+      throw new ReparseError('PDFObjectStreamParser', 'parseIntoContext');  // single-use, like PDFParser
+    }
+    this.alreadyParsed = true;
+    const offsetsAndObjectNumbers = this.parseOffsetsAndObjectNumbers();
+    for (let i = 0, len = offsetsAndObjectNumbers.length; i < len; i++) {
+      const entry = offsetsAndObjectNumbers[i];
+      this.bytes.moveTo(this.firstOffset + entry.offset);  // entry offsets are relative to firstOffset
+      const object = this.parseObject();
+      const ref = PDFRef.of(entry.objectNumber, 0);  // contained objects are always assigned at generation 0
+      this.context.assign(ref, object);
+    }
+  };
+
+  // PDFDocument.load only awaited parseDocument(); now that's sync, the
+  // outer __awaiter is wasted too. Drop it. Parameters are unchanged, but
+  // the document is returned directly: `await PDFDocument.load(...)` still
+  // works, `.then()` does not. parseSpeed is ignored (no yield gate to tune).
+  PDFDocument.load = function loadSync(pdf, options) {
+    if (options === undefined) options = {};
+    const ignoreEncryption      = options.ignoreEncryption      === undefined ? false : options.ignoreEncryption;
+    const throwOnInvalidObject  = options.throwOnInvalidObject  === undefined ? false : options.throwOnInvalidObject;
+    const updateMetadata        = options.updateMetadata        === undefined ? true  : options.updateMetadata;
+    const capNumbers            = options.capNumbers            === undefined ? false : options.capNumbers;
+    const bytes = toUint8Array(pdf);
+    const context = PDFParser.forBytesWithOptions(
+      bytes, Infinity, throwOnInvalidObject, capNumbers,  // Infinity: presumably the objectsPerTick slot, unused by the sync parser
+    ).parseDocument();
+    return new PDFDocument(context, ignoreEncryption, updateMetadata);
+  };
+
+  // ----- Save side ---------------------------------------------------
+
+  // PDFWriter.serializeToBuffer awaits computeBufferSize, which in our
+  // pipeline is the ParallelStreamWriter override -- genuinely async
+  // because of `await Promise.all(deflated)` over libuv's thread pool.
+  // So the wrapper stays async. The conditional waitForTick yield in
+  // its main loop is the only piece we strip.
+  PDFWriter.prototype.serializeToBuffer = async function serializeToBufferSync() {  // still async: computeBufferSize may return a Promise
+    const { size, header, indirectObjects, xref, trailerDict, trailer } =
+      await this.computeBufferSize();
+    const buffer = new Uint8Array(size);  // sized up front; every write below advances offset
+    let offset = 0;
+    offset += header.copyBytesInto(buffer, offset);
+    buffer[offset++] = CharCodes.Newline;
+    buffer[offset++] = CharCodes.Newline;
+    for (let idx = 0, len = indirectObjects.length; idx < len; idx++) {
+      const indirectObject = indirectObjects[idx];  // [ref, object] pair
+      const ref = indirectObject[0];
+      const object = indirectObject[1];
+      offset += copyStringIntoBuffer(String(ref.objectNumber), buffer, offset);  // "N G obj\n" header
+      buffer[offset++] = CharCodes.Space;
+      offset += copyStringIntoBuffer(String(ref.generationNumber), buffer, offset);
+      buffer[offset++] = CharCodes.Space;
+      buffer[offset++] = CharCodes.o;
+      buffer[offset++] = CharCodes.b;
+      buffer[offset++] = CharCodes.j;
+      buffer[offset++] = CharCodes.Newline;
+      offset += object.copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Newline;  // "\nendobj\n\n" footer
+      buffer[offset++] = CharCodes.e;
+      buffer[offset++] = CharCodes.n;
+      buffer[offset++] = CharCodes.d;
+      buffer[offset++] = CharCodes.o;
+      buffer[offset++] = CharCodes.b;
+      buffer[offset++] = CharCodes.j;
+      buffer[offset++] = CharCodes.Newline;
+      buffer[offset++] = CharCodes.Newline;
+    }
+    if (xref) {  // absent when computeBufferSize returns no xref (the stream-writer variant)
+      offset += xref.copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Newline;
+    }
+    if (trailerDict) {  // likewise optional
+      offset += trailerDict.copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Newline;
+      buffer[offset++] = CharCodes.Newline;
+    }
+    offset += trailer.copyBytesInto(buffer, offset);
+    return buffer;
+  };
+
+  // PDFWriter.computeBufferSize -- the basic (non-stream) writer's
+  // sizing pass. Not on our pipeline's hot path (we route through
+  // PDFStreamWriter via ParallelStreamWriter, both of which override
+  // this method) but patched for consistency: the only async thing
+  // upstream is the conditional waitForTick yield in its loop.
+  PDFWriter.prototype.computeBufferSize = function computeBufferSizeBaseSync() {  // sizing pass for serializeToBuffer
+    const header = PDFHeader.forVersion(1, 7);
+    let size = header.sizeInBytes() + 2;  // +2: the two newlines written after the header
+    const xref = PDFCrossRefSection.create();
+    const indirectObjects = this.context.enumerateIndirectObjects();
+    for (let idx = 0, len = indirectObjects.length; idx < len; idx++) {
+      const indirectObject = indirectObjects[idx];
+      const ref = indirectObject[0];
+      xref.addEntry(ref, size);  // size so far is this object's byte offset
+      size += this.computeIndirectObjectSize(indirectObject);
+    }
+    const xrefOffset = size;  // where the xref section will start
+    size += xref.sizeInBytes() + 1;  // +1: newline after the xref section
+    const trailerDict = PDFTrailerDict.of(this.createTrailerDict());
+    size += trailerDict.sizeInBytes() + 2;  // +2: two newlines after the trailer dict
+    const trailer = PDFTrailer.forLastCrossRefSectionOffset(xrefOffset);
+    size += trailer.sizeInBytes();
+    return { size, header, indirectObjects, xref, trailerDict, trailer };
+  };
+
+  // PDFStreamWriter.computeBufferSize -- the upstream stream writer's
+  // sizing pass with two waitForTick gates (one per loop). Not on our
+  // pipeline's hot path (ParallelStreamWriter overrides this with its
+  // own three-phase parallel-deflate version) but patched for
+  // consistency. Logic mirrors the upstream method body exactly.
+  PDFStreamWriter.prototype.computeBufferSize = function computeBufferSizeStreamSync() {  // sizing pass, object-stream layout
+    let objectNumber = this.context.largestObjectNumber + 1;  // next free number for the streams created below
+    const header = PDFHeader.forVersion(1, 7);
+    let size = header.sizeInBytes() + 2;  // +2: the two newlines written after the header
+    const xrefStream = PDFCrossRefStream.create(this.createTrailerDict(), this.encodeStreams);
+
+    const uncompressedObjects = [];  // [ref, object] pairs written directly to the file
+    const compressedObjects = [];    // chunks of pairs, one chunk per object stream
+    const objectStreamRefs = [];     // ref of the object stream for each chunk (parallel array)
+
+    const indirectObjects = this.context.enumerateIndirectObjects();
+    for (let idx = 0, len = indirectObjects.length; idx < len; idx++) {
+      const indirectObject = indirectObjects[idx];
+      const ref = indirectObject[0];
+      const object = indirectObject[1];
+      const shouldNotCompress =
+        ref === this.context.trailerInfo.Encrypt ||
+        object instanceof PDFStream ||
+        object instanceof PDFInvalidObject ||
+        ref.generationNumber !== 0;
+      if (shouldNotCompress) {
+        uncompressedObjects.push(indirectObject);
+        xrefStream.addUncompressedEntry(ref, size);  // size so far is this object's byte offset
+        size += this.computeIndirectObjectSize(indirectObject);
+      } else {
+        let chunk = last(compressedObjects);
+        let objectStreamRef = last(objectStreamRefs);
+        if (!chunk || chunk.length % this.objectsPerStream === 0) {  // no chunk yet, or the current one is full
+          chunk = [];
+          compressedObjects.push(chunk);
+          objectStreamRef = PDFRef.of(objectNumber++);
+          objectStreamRefs.push(objectStreamRef);
+        }
+        xrefStream.addCompressedEntry(ref, objectStreamRef, chunk.length);  // chunk.length = index inside the stream
+        chunk.push(indirectObject);
+      }
+    }
+
+    for (let idx = 0, len = compressedObjects.length; idx < len; idx++) {  // each chunk becomes one object stream
+      const chunk = compressedObjects[idx];
+      const ref = objectStreamRefs[idx];
+      const objectStream = PDFObjectStream.withContextAndObjects(this.context, chunk, this.encodeStreams);
+      xrefStream.addUncompressedEntry(ref, size);
+      size += this.computeIndirectObjectSize([ref, objectStream]);
+      uncompressedObjects.push([ref, objectStream]);
+    }
+
+    const xrefStreamRef = PDFRef.of(objectNumber++);
+    xrefStream.dict.set(SizeName, PDFNumber.of(objectNumber));  // after the increment: one past the xref stream's number
+    xrefStream.addUncompressedEntry(xrefStreamRef, size);
+    const xrefOffset = size;  // where the xref stream object will start
+    size += this.computeIndirectObjectSize([xrefStreamRef, xrefStream]);
+    uncompressedObjects.push([xrefStreamRef, xrefStream]);
+
+    const trailer = PDFTrailer.forLastCrossRefSectionOffset(xrefOffset);
+    size += trailer.sizeInBytes();
+    return { size, header, indirectObjects: uncompressedObjects, trailer };  // no xref / trailerDict keys in this variant
+  };
+
+  PDFParser.prototype.__fastSyncLoadInstalled = true;
+}
diff --git a/docs/lib/measure-pass.mjs b/docs/lib/measure-pass.mjs
new file mode 100644
index 0000000..293e688
--- /dev/null
+++ b/docs/lib/measure-pass.mjs
@@ -0,0 +1,600 @@
+// No-allocate measure pass over a PDF byte stream.
+//
+// Walks the PDF grammar (indirect objects, dicts, arrays, names,
+// numbers, refs, strings, streams, ObjStms-with-inflate) without
+// instantiating any PDFObject. Produces counts that downstream
+// pre-sizing shims consume:
+//
+//   { indirectObjects, dicts, dictSlots, arrays, arraySlots,
+//     refs, names, numbers, strings, hexStrings, streams,
+//     objStms, objStmInner, maxDictSlots, maxArraySlots,
+//     maxRecursion, totalStreamBytes, totalInflatedBytes }
+//
+// Counts are *appearances*, not unique values. Phase 2+ will add
+// interning to produce unique-count tables (for exact name/ref/
+// number pool sizing).
+//
+// Allocation discipline:
+//   - No string concat. Names, numbers, strings are skipped by
+//     advancing the byte cursor without keeping bytes.
+//   - Per-dict captures (/Length, /Type, /N, /First) live on
+//     depth-indexed typed-array stacks. Max recursion observed
+//     on the book is 4; stack size 64 is plenty.
+//   - ObjStm offset arrays are reusable Int32Array(512), grown
+//     on demand. The inflate destination is a fresh Buffer per
+//     ObjStm (Chrome's raw output has zero ObjStms; book.pdf
+//     has 453 after pdf-lib's save bundles them).
+//
+// One PDF parse-corner to remember: PDF reals can omit the
+// integer part. `.251` is valid (Chrome emits it for /CA, /ca
+// alpha values). The parser accepts `[sign?][digits?]
+// [.[digits?]]?` with the constraint that at least one digit
+// appears.
+
+import { inflateSync } from 'node:zlib';
+
+// ---- Byte constants -------------------------------------------------
+
+const TAB = 9, LF = 10, FF = 12, CR = 13, SP = 32;
+const LT = 60 /* < */, GT = 62 /* > */;
+const LB = 91 /* [ */, RB = 93 /* ] */;
+const LP = 40 /* ( */, RP = 41 /* ) */;
+const SLASH = 47, PERCENT = 37, BACKSLASH = 92;
+const D0 = 48, D9 = 57;
+const MINUS = 45, PLUS = 43, DOT = 46;
+const a_ = 97, b_ = 98, d_ = 100, e_ = 101, f_ = 102, j_ = 106;
+const l_ = 108, m_ = 109, n_ = 110, o_ = 111, r_ = 114, s_ = 115;
+const t_ = 116, u_ = 117, x_ = 120;
+const R_CH = 82, L_CH = 76, T_CH = 84, N_CH = 78, F_CH = 70;
+
+// ---- Lookup tables (mirror pdf-lib's IsWhitespace / IsDelimiter / IsDigit / IsNumeric) ----
+
+const IsWS = new Uint8Array(256);
+IsWS[0] = IsWS[TAB] = IsWS[LF] = IsWS[FF] = IsWS[CR] = IsWS[SP] = 1;
+
+const IsDelim = new Uint8Array(256);
+IsDelim[LT] = IsDelim[GT] = IsDelim[LB] = IsDelim[RB] = 1;
+IsDelim[LP] = IsDelim[RP] = IsDelim[SLASH] = IsDelim[PERCENT] = 1;
+
+const IsDigit = new Uint8Array(256);
+for (let b = D0; b <= D9; b++) IsDigit[b] = 1;
+
+const IsNumeric = new Uint8Array(IsDigit);
+IsNumeric[DOT] = IsNumeric[MINUS] = IsNumeric[PLUS] = 1;
+
+// ---- Measurer -------------------------------------------------------
+
+export class Measurer {
+  constructor(buf) {  // buf: the PDF bytes to walk; it is indexed and its .length is read
+    this.buf = buf;
+    this.pos = 0;  // byte cursor into buf
+    this._len = buf.length;
+    // Appearance counters (not unique values); all start at zero.
+    this.numIndirectObjects = 0;
+    this.numDicts = 0;
+    this.numDictSlots = 0;
+    this.numArrays = 0;
+    this.numArraySlots = 0;
+    this.numRefs = 0;
+    this.numNames = 0;
+    this.numNumbers = 0;
+    this.numStrings = 0;
+    this.numHexStrings = 0;
+    this.numStreams = 0;
+    this.numObjStms = 0;
+    this.numObjStmInnerObjects = 0;
+    this.maxDictSlots = 0;
+    this.maxArraySlots = 0;
+    this.maxRecursionDepth = 0;
+    this.totalStreamBytes = 0;
+    this.totalInflatedBytes = 0;
+    // Depth-indexed stacks for the per-dict captures (/Length, /Type, /N, /First).
+    const MAX_DEPTH = 64;
+    this._depth = 0;
+    this._stLength  = new Int32Array(MAX_DEPTH);
+    this._stIsObjStm = new Uint8Array(MAX_DEPTH);
+    this._stN      = new Int32Array(MAX_DEPTH);
+    this._stFirst  = new Int32Array(MAX_DEPTH);
+    // Reusable ObjStm offset arrays; the file header says they are grown on demand (growth code is outside this view).
+    this._objNums    = new Int32Array(512);
+    this._objOffsets = new Int32Array(512);
+  }
+
+  // ---- Skip helpers (no allocation) --------------------------------
+
+  skipWS() {  // Advance pos past whitespace and %-comments.
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len) {
+      const b = buf[p];
+      if (IsWS[b]) { p++; continue; }
+      if (b === PERCENT) {
+        while (p < len && buf[p] !== LF && buf[p] !== CR) p++;  // a comment runs to the end of the line
+        continue;  // the LF / CR itself is consumed as whitespace on the next pass
+      }
+      break;  // first byte that is neither whitespace nor a comment start
+    }
+    this.pos = p;
+  }
+
+  // Parse an integer in place. No string concat. Returns NaN if no digit.
+  // Does NOT bump numNumbers (used for metadata: header, ObjStm offsets).
+  _skipInt() {
+    const buf = this.buf, len = this._len;
+    let p = this.pos, v = 0, sign = 1, any = 0;  // any = 1 once at least one digit was read
+    if (buf[p] === MINUS) { sign = -1; p++; }
+    else if (buf[p] === PLUS) { p++; }
+    while (p < len) {
+      const b = buf[p];
+      if (b < D0 || b > D9) break;
+      v = v * 10 + (b - D0);  // accumulate the value digit by digit
+      any = 1; p++;
+    }
+    this.pos = p;  // a consumed sign stays consumed even when NaN is returned
+    return any ? sign * v : NaN;
+  }
+
+  _skipNameBody() {  // Advance pos to the first whitespace / delimiter byte, or to end of buffer.
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len) {
+      const b = buf[p];
+      if (IsWS[b] || IsDelim[b]) break;  // name bodies end at whitespace or a delimiter
+      p++;
+    }
+    this.pos = p;
+  }
+
+  skipName() {  // Skip one name token and count it in numNames.
+    this.pos++;  // step over the first byte (presumably the leading '/'; it is not checked here)
+    this._skipNameBody();
+    this.numNames++;
+  }
+
+  skipString() {
+    this.pos++;
+    const buf = this.buf, len = this._len;
+    let p = this.pos, depth = 1;
+    while (p < len && depth > 0) {
+      const b = buf[p];
+      if (b === BACKSLASH) { p += 2; continue; }
+      if (b === LP) depth++;
+      else if (b === RP) depth--;
+      p++;
+    }
+    this.pos = p;
+    this.numStrings++;
+  }
+
+  skipHexString() {
+    this.pos++;
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len && buf[p] !== GT) p++;
+    p++;
+    this.pos = p;
+    this.numHexStrings++;
+  }
+
+  // Skip /name; tag whether it matched a known stream-related key.
+  // 0=other, 1=Length, 2=Type, 3=N, 4=First.
+  matchDictKey() {
+    const buf = this.buf, len = this._len;
+    this.pos++;  // step over the leading '/'
+    const start = this.pos;
+    let match = 0;
+    const b0 = buf[start];  // dispatch on the first byte of the name body
+    if (b0 === L_CH) {
+      if (start + 6 <= len &&
+          buf[start+1] === e_ && buf[start+2] === n_ &&
+          buf[start+3] === 103 /* g */ && buf[start+4] === t_ &&
+          buf[start+5] === 104 /* h */ &&
+          (start+6 === len || IsWS[buf[start+6]] || IsDelim[buf[start+6]])) {  // must end here, not merely be a prefix
+        match = 1; this.pos = start + 6;
+      }
+    } else if (b0 === T_CH) {
+      if (start + 4 <= len &&
+          buf[start+1] === 121 /* y */ && buf[start+2] === 112 /* p */ &&
+          buf[start+3] === e_ &&
+          (start+4 === len || IsWS[buf[start+4]] || IsDelim[buf[start+4]])) {
+        match = 2; this.pos = start + 4;
+      }
+    } else if (b0 === N_CH) {
+      if (start + 1 === len || IsWS[buf[start+1]] || IsDelim[buf[start+1]]) {
+        match = 3; this.pos = start + 1;
+      }
+    } else if (b0 === F_CH) {
+      if (start + 5 <= len &&
+          buf[start+1] === 105 /* i */ && buf[start+2] === r_ &&
+          buf[start+3] === s_ && buf[start+4] === t_ &&
+          (start+5 === len || IsWS[buf[start+5]] || IsDelim[buf[start+5]])) {
+        match = 4; this.pos = start + 5;
+      }
+    }
+    if (match === 0) this._skipNameBody();  // any other key: skip it generically from start
+    this.numNames++;  // keys count as names whether or not they matched
+    return match;
+  }
+
+  // After / is already skipped, check if name body equals an ASCII string.
+  // Does NOT move pos.
+  _isNameAt(p, name) {
+    // True when the bytes at p spell `name` and the name ends right there
+    // (end of input, whitespace, or a delimiter). Leaves pos untouched.
+    const bytes = this.buf;
+    const stop = p + name.length;
+    if (stop > this._len) return false;
+    let k = name.length;
+    while (k-- > 0) if (bytes[p + k] !== name.charCodeAt(k)) return false;
+    if (stop === this._len) return true;
+    return Boolean(IsWS[bytes[stop]] || IsDelim[bytes[stop]]);
+  }
+
+  // ---- Number / Ref ------------------------------------------------
+
+  // PDF number grammar: optional sign, optional digits, optional dot,
+  // optional digits. At least one digit required somewhere. No exps.
+  // Returns the integer value for pure-integer-non-ref case (for
+  // /Length capture); else NaN.
+  parseNumberOrRefCapture() {  // consume a number or an "int int R" reference; bumps numNumbers or numRefs
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    let sign = 1;
+    if (buf[p] === MINUS) { sign = -1; p++; }
+    else if (buf[p] === PLUS) { p++; }
+    let intDigits = 0, intVal = 0;
+    while (p < len && buf[p] >= D0 && buf[p] <= D9) {
+      intVal = intVal * 10 + (buf[p] - D0);
+      intDigits++; p++;
+    }
+    let hasDot = 0, fracDigits = 0;
+    if (p < len && buf[p] === DOT) {
+      hasDot = 1; p++;
+      while (p < len && buf[p] >= D0 && buf[p] <= D9) { fracDigits++; p++; }  // fraction digits are skipped, not accumulated
+    }
+    if (intDigits === 0 && fracDigits === 0) {
+      throw new Error('measure-pass: expected number at ' + this.pos);  // this.pos is still the token start here
+    }
+    this.pos = p;
+    if (hasDot) {  // a real number can never begin a reference
+      this.numNumbers++;
+      return NaN;
+    }
+    const save = this.pos;  // rewind point if the lookahead below is not a reference
+    this.skipWS();
+    if (this.pos < len && IsDigit[buf[this.pos]]) {  // second integer of a possible "a b R"
+      this._skipInt();
+      this.skipWS();
+      if (this.pos < len && buf[this.pos] === R_CH) {
+        this.pos++;
+        this.numRefs++;
+        return NaN;
+      }
+    }
+    this.pos = save;  // plain integer: leave whatever follows unconsumed
+    this.numNumbers++;
+    return sign * intVal;
+  }
+
+  // ---- Object dispatch --------------------------------------------
+
+  parseObject() {  // consume exactly one object at pos, dispatching on its first byte
+    this.skipWS();
+    const buf = this.buf, len = this._len;
+    if (this.pos >= len) return;  // end of input: nothing is consumed
+    const b = buf[this.pos];
+
+    if (b === t_) {  // keyword candidate: true
+      if (this.pos + 4 <= len &&
+          buf[this.pos+1] === r_ && buf[this.pos+2] === u_ && buf[this.pos+3] === e_) {
+        this.pos += 4; return;
+      }
+    } else if (b === f_) {  // keyword candidate: false
+      if (this.pos + 5 <= len &&
+          buf[this.pos+1] === a_ && buf[this.pos+2] === l_ &&
+          buf[this.pos+3] === s_ && buf[this.pos+4] === e_) {
+        this.pos += 5; return;
+      }
+    } else if (b === n_) {  // keyword candidate: null
+      if (this.pos + 4 <= len &&
+          buf[this.pos+1] === u_ && buf[this.pos+2] === l_ && buf[this.pos+3] === l_) {
+        this.pos += 4; return;
+      }
+    }
+
+    if (b === LT) {
+      if (buf[this.pos + 1] === LT) {  // two LT bytes open a dict; one opens a hex string
+        const d = this._depth;  // parseDict leaves _depth incremented; restore it here
+        this.parseDict();
+        this._depth = d;
+        return;
+      }
+      this.skipHexString();
+      return;
+    }
+    if (b === LP) { this.skipString(); return; }
+    if (b === SLASH) { this.skipName(); return; }
+    if (b === LB) { this.parseArray(); return; }
+    if (IsNumeric[b]) { this.parseNumberOrRefCapture(); return; }  // captured value is discarded in this context
+
+    throw new Error(`measure-pass: unexpected byte ${b} ('${String.fromCharCode(b)}') at ${this.pos}`);
+  }
+
+  // Parse << ... >>. Push frame on stack; do NOT decrement depth.
+  // Caller reads stack frame at index this._depth - 1 and decrements.
+  parseDict() {  // consume << ... >>, capturing Length / Type / N / First into frame d
+    const d = this._depth++;
+    if (d >= 64) throw new Error('measure-pass: dict depth overflow at ' + this.pos);  // presumably the _st* arrays hold 64 frames -- confirm at their allocation
+    if (this._depth > this.maxRecursionDepth) this.maxRecursionDepth = this._depth;
+    this._stLength[d]  = -1;  // reset this frame's captures before parsing any entry
+    this._stIsObjStm[d] = 0;
+    this._stN[d]      = -1;
+    this._stFirst[d]  = -1;
+
+    this.pos += 2;  // step over the opening '<<' (callers checked both LT bytes)
+    this.skipWS();
+
+    const buf = this.buf, len = this._len;
+    let count = 0;  // number of key/value entries seen
+    while (this.pos < len) {
+      if (buf[this.pos] === GT && buf[this.pos + 1] === GT) break;  // closing '>>'
+      if (buf[this.pos] !== SLASH) throw new Error('measure-pass: expected name at ' + this.pos);
+
+      const tag = this.matchDictKey();
+      this.skipWS();
+
+      if (tag === 1 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stLength[d] = v;  // NaN means a real number or a reference: not captured
+      } else if (tag === 2 && buf[this.pos] === SLASH) {
+        if (this._isNameAt(this.pos + 1, 'ObjStm')) this._stIsObjStm[d] = 1;
+        this.pos++;  // then consume the value name the same way skipName would
+        this._skipNameBody();
+        this.numNames++;
+      } else if (tag === 3 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stN[d] = v;
+      } else if (tag === 4 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stFirst[d] = v;
+      } else {
+        this.parseObject();  // untracked key, or tracked key with an unexpected value type
+      }
+      this.skipWS();
+      count++;
+    }
+    this.pos += 2;  // step over '>>' (also runs when the loop stopped at end of input)
+
+    this.numDicts++;
+    this.numDictSlots += count * 2;  // two slots per entry
+    if (count * 2 > this.maxDictSlots) this.maxDictSlots = count * 2;
+  }
+
+  parseArray() {
+    // Parse `[ ... ]` from the opening bracket, counting one slot per
+    // element. Unlike parseDict, this restores _depth before returning.
+    const depthBefore = this._depth++;
+    if (this._depth > this.maxRecursionDepth) this.maxRecursionDepth = this._depth;
+
+    this.pos++;  // step over the opening bracket
+    this.skipWS();
+    const buf = this.buf, len = this._len;
+    let count = 0;
+    while (this.pos < len && buf[this.pos] !== RB) {
+      this.parseObject();
+      this.skipWS();
+      count++;
+    }
+    this.pos++;  // step past the byte the loop stopped on (RB unless input ended)
+    this.numArrays++;
+    this.numArraySlots += count;
+    if (count > this.maxArraySlots) this.maxArraySlots = count;
+    this._depth = depthBefore;  // saved value, the way parseObject restores around parseDict
+  }
+
+  // ---- Indirect object + stream handling --------------------------
+
+  findEndStream(from) {
+    // Linear search for the "endstream" keyword at or after `from`; the
+    // returned offset excludes any CR/LF bytes directly before the keyword.
+    const bytes = this.buf;
+    const lastStart = this._len - 9;  // last offset where 9 bytes still fit
+    for (let at = from; at <= lastStart; at++) {
+      if (bytes[at] !== e_ || bytes[at+1] !== n_ || bytes[at+2] !== d_ ||
+          bytes[at+3] !== s_ || bytes[at+4] !== t_ || bytes[at+5] !== r_ ||
+          bytes[at+6] !== e_ || bytes[at+7] !== a_ || bytes[at+8] !== m_) continue;
+      let cut = at;
+      while (cut > from && (bytes[cut-1] === LF || bytes[cut-1] === CR)) cut--;
+      return cut;
+    }
+    throw new Error('measure-pass: endstream not found from ' + from);
+  }
+
+  processObjStm(start, end, N, first) {
+    // Inflate buf[start, end) and walk the N objects inside it: N pairs
+    // of integers (object number, offset) come first, and each object
+    // body sits at `first + offset` within the inflated bytes.
+    let inflated;
+    try {
+      inflated = inflateSync(this.buf.subarray(start, end));
+    } catch (e) {
+      console.warn(`measure-pass: inflate failed at ${start}: ${e.message}`);
+      return;
+    }
+    this.totalInflatedBytes += inflated.length;
+    this.numObjStmInnerObjects += N;
+    if (N > this._objOffsets.length) {
+      this._objOffsets = new Int32Array(N);
+      this._objNums    = new Int32Array(N);
+    }
+
+    // Point the cursor at the inflated bytes; `finally` puts the outer
+    // buffer back even when an inner object makes the parser throw.
+    const saveBuf = this.buf, savePos = this.pos, saveLen = this._len;
+    this.buf = inflated; this.pos = 0; this._len = inflated.length;
+    try {
+      for (let i = 0; i < N; i++) {
+        this.skipWS();
+        this._objNums[i] = this._skipInt();
+        this.skipWS();
+        this._objOffsets[i] = this._skipInt();
+      }
+      for (let i = 0; i < N; i++) {
+        this.pos = first + this._objOffsets[i];
+        const d0 = this._depth;
+        this.parseObject();
+        this._depth = d0;
+      }
+    } finally {
+      this.buf = saveBuf; this.pos = savePos; this._len = saveLen;
+    }
+  }
+
+  parseIndirectObject() {  // consume "<int> <int> obj <body> [stream ... endstream] [endobj]"
+    this.skipWS();
+    this._skipInt();  // the two header integers are skipped, not recorded
+    this.skipWS();
+    this._skipInt();
+    this.skipWS();
+
+    const buf = this.buf, len = this._len;
+    if (!(this.pos + 3 <= len && buf[this.pos] === o_ && buf[this.pos+1] === b_ && buf[this.pos+2] === j_)) {
+      throw new Error('measure-pass: expected "obj" at ' + this.pos);
+    }
+    this.pos += 3;
+    this.skipWS();
+    this.numIndirectObjects++;
+
+    const frameDepth = this._depth;  // index of the frame parseDict is about to fill
+    let wasDict = false;
+    if (this.pos + 2 <= len && buf[this.pos] === LT && buf[this.pos+1] === LT) {
+      this.parseDict();  // called directly so the frame can be read back below
+      wasDict = true;
+    } else {
+      this.parseObject();
+    }
+    this.skipWS();
+
+    if (wasDict && this.pos + 6 <= len &&
+        buf[this.pos] === s_ && buf[this.pos+1] === t_ && buf[this.pos+2] === r_ &&
+        buf[this.pos+3] === e_ && buf[this.pos+4] === a_ && buf[this.pos+5] === m_) {
+      this.pos += 6;
+      if (this.pos < len && buf[this.pos] === CR) this.pos++;  // optional CR, then optional LF, after the keyword
+      if (this.pos < len && buf[this.pos] === LF) this.pos++;
+
+      const streamStart = this.pos;
+      const length    = this._stLength[frameDepth];
+      const isObjStm  = this._stIsObjStm[frameDepth];
+      const N         = this._stN[frameDepth];
+      const first     = this._stFirst[frameDepth];
+
+      let streamEnd;
+      if (length > 0) {  // use the captured Length only if the byte at streamEnd looks plausible
+        streamEnd = streamStart + length;
+        if (streamEnd > len ||
+            !(buf[streamEnd] === LF || buf[streamEnd] === CR ||
+              buf[streamEnd] === e_ || IsWS[buf[streamEnd]])) {
+          streamEnd = this.findEndStream(streamStart);
+        }
+      } else {  // Length missing, non-positive, or not a plain integer: search for the keyword
+        streamEnd = this.findEndStream(streamStart);
+      }
+      this.pos = streamEnd;
+      this.totalStreamBytes += (streamEnd - streamStart);
+      this.numStreams++;
+
+      if (isObjStm && N > 0 && first > 0) {
+        this.numObjStms++;
+        this.processObjStm(streamStart, streamEnd, N, first);
+        this.pos = streamEnd;
+      }
+
+      this.skipWS();
+      if (this.pos + 9 <= len &&
+          buf[this.pos] === e_ && buf[this.pos+1] === n_ && buf[this.pos+2] === d_ &&
+          buf[this.pos+3] === s_ && buf[this.pos+4] === t_ && buf[this.pos+5] === r_ &&
+          buf[this.pos+6] === e_ && buf[this.pos+7] === a_ && buf[this.pos+8] === m_) {
+        this.pos += 9;
+      }
+      this.skipWS();
+    }
+
+    if (wasDict) this._depth = frameDepth;  // undo the increment parseDict left behind
+
+    this.skipWS();
+    if (this.pos + 6 <= len &&
+        buf[this.pos] === e_ && buf[this.pos+1] === n_ && buf[this.pos+2] === d_ &&
+        buf[this.pos+3] === o_ && buf[this.pos+4] === b_ && buf[this.pos+5] === j_) {
+      this.pos += 6;  // a missing "endobj" is tolerated silently
+    }
+  }
+
+  walk() {  // loop 1 finds the first "int int obj" header; loop 2 parses objects back to back
+    const buf = this.buf, len = this._len;
+
+    while (this.pos < len) {
+      this.skipWS();
+      if (this.pos >= len) break;
+      const b = buf[this.pos];
+      if (IsDigit[b]) {
+        const save = this.pos;
+        this._skipInt();
+        if (buf[this.pos] === SP || buf[this.pos] === TAB) {
+          this.skipWS();
+          if (IsDigit[buf[this.pos]]) {
+            this._skipInt();
+            this.skipWS();
+            if (this.pos + 3 <= len && buf[this.pos] === o_ &&
+                buf[this.pos+1] === b_ && buf[this.pos+2] === j_) {
+              this.pos = save;  // rewind to the first digit of the header
+              break;
+            }
+          }
+        }
+        this.pos = save + 1;  // not a header: retry one byte further on
+      } else {
+        this.pos++;
+      }
+    }
+
+    while (this.pos < len) {
+      this.skipWS();
+      if (this.pos >= len) break;
+      const b = buf[this.pos];
+      if (b === x_) break;  // presumably the start of "xref" -- only this one byte is checked
+      if (b === t_ && buf[this.pos+1] === r_ && buf[this.pos+2] === a_ &&
+          buf[this.pos+3] === 105 /* i */) break;  // "trai" prefix
+      if (b === s_ && buf[this.pos+1] === t_ && buf[this.pos+2] === a_ &&
+          buf[this.pos+3] === r_ && buf[this.pos+4] === t_) break;  // "start" prefix
+      if (!IsDigit[b]) break;  // anything that cannot begin an object header ends the walk
+      this.parseIndirectObject();
+    }
+  }
+}
+
+// ---- Convenience wrapper -------------------------------------------
+
+export function measure(bytes) {  // run one Measurer walk over `bytes`; return its counters as a plain object
+  const m = new Measurer(bytes);
+  m.walk();
+  return {
+    indirectObjects:    m.numIndirectObjects,
+    dicts:              m.numDicts,
+    dictSlots:          m.numDictSlots,  // parseDict adds two per dict entry
+    arrays:             m.numArrays,
+    arraySlots:         m.numArraySlots,  // parseArray adds one per element
+    refs:               m.numRefs,
+    names:              m.numNames,
+    numbers:            m.numNumbers,
+    strings:            m.numStrings,
+    hexStrings:         m.numHexStrings,
+    streams:            m.numStreams,
+    objStms:            m.numObjStms,
+    objStmInner:        m.numObjStmInnerObjects,
+    maxDictSlots:       m.maxDictSlots,
+    maxArraySlots:      m.maxArraySlots,
+    maxRecursion:       m.maxRecursionDepth,
+    totalStreamBytes:   m.totalStreamBytes,
+    totalInflatedBytes: m.totalInflatedBytes,
+  };
+}
diff --git a/docs/lib/parallel-deflate.mjs b/docs/lib/parallel-deflate.mjs
new file mode 100644
index 0000000..0069871
--- /dev/null
+++ b/docs/lib/parallel-deflate.mjs
@@ -0,0 +1,187 @@
+// Drop-in async replacement for `pdfDoc.save({ useObjectStreams: true })`
+// that parallelises the per-object-stream deflate work onto libuv's
+// thread pool. Sole exported entry point: `parallelSave(pdfDoc, opts)`.
+//
+// Why: pdf-lib's PDFStreamWriter.computeBufferSize creates one
+// PDFObjectStream per chunk, then immediately calls
+// computeIndirectObjectSize on each. sizeInBytes() walks the Cache,
+// which lazy-populates via a deflate of the unencoded contents. The
+// whole pass is synchronous, so the per-chunk zlib work runs serially
+// -- accounted for ~30 % of save() wall time on the book before this.
+//
+// What: same construction logic as PDFStreamWriter, split into three
+// phases:
+//   1. classify uncompressed vs compressed (same as upstream)
+//   2. instantiate every PDFObjectStream up-front, then `await
+//      Promise.all` an async node:zlib.deflate per stream so libuv's
+//      thread pool (default 4) runs them concurrently
+//   3. size + emit (same as upstream, but every cache.access() is a hit)
+// The xrefStream is one more PDFFlateStream whose contents depend on
+// the offsets computed in phase 3; we pre-deflate it once via
+// node:zlib.deflateSync right after those offsets are pinned, so even
+// that final stream never falls back to pdf-lib's pure-JS deflate.
+//
+// Output: byte-near-equivalent to pdfDoc.save({ useObjectStreams: true }).
+// node:zlib's match choices in the LZ77 inner loop may differ from
+// pdf-lib's default deflate library, so compressed stream contents can
+// differ by a few bytes, with matching /Length deltas; viewer-invisible.
+//
+// Parallelism is bounded by UV_THREADPOOL_SIZE (default 4). Bump it via
+// `process.env.UV_THREADPOOL_SIZE = '8'` before any libuv work fires
+// if you want more concurrency.
+
+import { deflate, deflateSync } from 'node:zlib';
+import { promisify } from 'node:util';
+import {
+  PDFStreamWriter,
+  PDFObjectStream,
+  PDFCrossRefStream,
+  PDFRef,
+  PDFName,
+  PDFNumber,
+  PDFInvalidObject,
+  PDFStream,
+  PDFHeader,
+  PDFTrailer,
+} from 'pdf-lib';
+
+const deflateAsync = promisify(deflate);
+
+class ParallelStreamWriter extends PDFStreamWriter {  // PDFStreamWriter whose object streams are deflated via node:zlib before sizing
+  constructor(context, encodeStreams, objectsPerStream, parallel) {
+    // PDFWriter's second ctor param is objectsPerTick -- the yield knob
+    // that drives shouldWaitForTick. fast-sync-load.mjs rips out every
+    // caller of shouldWaitForTick on both the parser and writer sides,
+    // so the value here is vestigial. Pass Infinity for explicitness.
+    super(context, Infinity, encodeStreams, objectsPerStream);
+    this._lastPrecompressed = 0;  // object streams pre-deflated by the latest computeBufferSize
+    this._parallel = parallel;  // falsy: phase 2 leaves the caches empty
+  }
+
+  async computeBufferSize() {  // resolves to { size, header, indirectObjects, trailer }
+    let objectNumber = this.context.largestObjectNumber + 1;  // next free number, used for the new stream objects
+    const header = PDFHeader.forVersion(1, 7);
+    let size = header.sizeInBytes() + 2;  // running byte offset of the next object written
+    const xrefStream = PDFCrossRefStream.create(
+      this.createTrailerDict(),
+      this.encodeStreams,
+    );
+
+    const uncompressedObjects = [];
+    const compressedChunks = [];  // parallel arrays: compressedChunks[i] is emitted as objectStreamRefs[i]
+    const objectStreamRefs = [];
+
+    // ----- Phase 1: classify -----
+    const indirectObjects = this.context.enumerateIndirectObjects();
+    for (let i = 0; i < indirectObjects.length; i++) {
+      const indirectObject = indirectObjects[i];
+      const [ref, object] = indirectObject;
+      const shouldNotCompress =
+        ref === this.context.trailerInfo.Encrypt ||
+        object instanceof PDFStream ||
+        object instanceof PDFInvalidObject ||
+        ref.generationNumber !== 0;
+
+      if (shouldNotCompress) {
+        uncompressedObjects.push(indirectObject);
+        xrefStream.addUncompressedEntry(ref, size);
+        size += this.computeIndirectObjectSize(indirectObject);
+      } else {
+        let chunk = compressedChunks.length === 0 ? null : compressedChunks[compressedChunks.length - 1];
+        let objectStreamRef = objectStreamRefs.length === 0 ? null : objectStreamRefs[objectStreamRefs.length - 1];
+        if (!chunk || chunk.length % this.objectsPerStream === 0) {  // no chunk yet, or the current one is full
+          chunk = [];
+          compressedChunks.push(chunk);
+          objectStreamRef = PDFRef.of(objectNumber++);
+          objectStreamRefs.push(objectStreamRef);
+        }
+        xrefStream.addCompressedEntry(ref, objectStreamRef, chunk.length);  // chunk.length is the index inside the stream
+        chunk.push(indirectObject);
+      }
+    }
+
+    // ----- Phase 2: instantiate object streams and parallel-deflate -----
+    const objectStreams = compressedChunks.map(chunk =>
+      PDFObjectStream.withContextAndObjects(this.context, chunk, this.encodeStreams),
+    );
+
+    if (this._parallel && this.encodeStreams && objectStreams.length > 0) {
+      // Fire each deflate onto libuv as soon as its buffer is built,
+      // so deflate of stream N runs concurrently with the build of
+      // N+1..453 instead of after all 453 builds finish. Saves the
+      // main-thread idle wait at the Promise.all (~30 ms on the book).
+      const deflated = await Promise.all(
+        objectStreams.map(os => deflateAsync(os.getUnencodedContents())),
+      );
+      for (let i = 0; i < objectStreams.length; i++) {
+        objectStreams[i].contentsCache.value = deflated[i];  // seed the cache so phase 3 sizing is a hit
+      }
+      this._lastPrecompressed = objectStreams.length;
+    } else {
+      this._lastPrecompressed = 0;
+    }
+
+    // ----- Phase 3: size object streams (cache hits) -----
+    for (let i = 0; i < objectStreams.length; i++) {
+      const ref = objectStreamRefs[i];
+      const objectStream = objectStreams[i];
+      xrefStream.addUncompressedEntry(ref, size);
+      size += this.computeIndirectObjectSize([ref, objectStream]);
+      uncompressedObjects.push([ref, objectStream]);
+    }
+
+    // ----- xrefStream wrap-up -----
+    // Its contents depend on the offsets computed above, so we can only
+    // populate them now. One stream -- deflate sync via node:zlib and
+    // pre-populate the cache so the subsequent computeIndirectObjectSize
+    // is a cache hit (otherwise pdf-lib's lazy populate would run its
+    // own deflate library on the main thread).
+    const xrefStreamRef = PDFRef.of(objectNumber++);
+    xrefStream.dict.set(PDFName.of('Size'), PDFNumber.of(objectNumber));
+    xrefStream.addUncompressedEntry(xrefStreamRef, size);
+    const xrefOffset = size;  // captured before the xref stream's own size is added
+    if (this.encodeStreams) {
+      xrefStream.contentsCache.value = deflateSync(xrefStream.getUnencodedContents());
+    }
+    size += this.computeIndirectObjectSize([xrefStreamRef, xrefStream]);
+    uncompressedObjects.push([xrefStreamRef, xrefStream]);
+
+    const trailer = PDFTrailer.forLastCrossRefSectionOffset(xrefOffset);
+    size += trailer.sizeInBytes();
+
+    return { size, header, indirectObjects: uncompressedObjects, trailer };  // indirectObjects now also holds the object streams and the xref stream
+  }
+}
+
+/**
+ * Replacement for `pdfDoc.save({ useObjectStreams: true })` with parallel
+ * deflate. Mirrors PDFDocument.save's pre-serialize steps (addDefaultPage,
+ * updateFieldAppearances, flush) before invoking the patched writer.
+ *
+ * Returns { bytes: Uint8Array, streamCount: number }.
+ */
+export async function parallelSave(pdfDoc, options = {}) {  // every option is optional; defaults are listed below
+  const {
+    addDefaultPage = true,
+    updateFieldAppearances = true,
+    objectsPerStream = 50,
+    encodeStreams = true,
+    parallel = true,  // false: skip the pre-deflate phase in ParallelStreamWriter
+  } = options;
+
+  if (addDefaultPage && pdfDoc.getPageCount() === 0) pdfDoc.addPage();
+  if (updateFieldAppearances) {
+    const form = pdfDoc.formCache.getValue();  // may be empty, in which case appearances are left alone
+    if (form) form.updateFieldAppearances();
+  }
+  await pdfDoc.flush();
+
+  const writer = new ParallelStreamWriter(
+    pdfDoc.context,
+    encodeStreams,
+    objectsPerStream,
+    parallel,
+  );
+  const bytes = await writer.serializeToBuffer();
+  return { bytes, streamCount: writer._lastPrecompressed };  // streamCount is 0 when nothing was pre-deflated
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index e7ad9bf..08117d5 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -31,9 +31,171 @@ import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve } from 'node:path';
 import { writeFileSync, existsSync } from 'node:fs';
 import puppeteer from 'puppeteer';
-import { PDFDocument, ParseSpeeds } from 'pdf-lib';
+import { PDFDocument } from 'pdf-lib';
+// Side-effecting imports. Mutate pdf-lib's live module exports
+// before any pdf-lib operation -- order doesn't matter. See
+// perf/notes/08-pdf-lib.md.
+//
+//   fast-refs-class   -- dense-array cache in front of PDFRef.of for
+//     the gen=0 case (82 % of ~1.2 M calls per load) PLUS a
+//     class-constructor shape for the PDFRef instance, AND drops
+//     the per-instance `tag` string (toString / sizeInBytes /
+//     copyBytesInto compute from objectNumber / generationNumber
+//     directly via _writeUint + _digitCount helpers). Replaces the
+//     `Object.create(PDFRef.prototype) + property writes` pattern of
+//     the older fast-refs.mjs shim, which V8 routes through the
+//     slow-property path: PDFRef ended up at ~60 B/instance vs
+//     PDFName's ~31 B (`new PDFName(...)`-built). The constructor
+//     gives V8 a stable hidden class from the first instance and
+//     drops per-instance cost to ~44 B. On the book (226 k unique
+//     PDFRefs) the combined effect is ~3.87 MB heap (-8.5 % of
+//     total process-phase allocation) and ~140 ms wall-clock (-12 %
+//     of process) on top of the tag-drop refinement that already
+//     trimmed parseIndirectObjectHeader by ~4.3 MB. Same prototype
+//     methods, same instanceof semantics; the only change is the
+//     construction style. See "fast-refs-class" in
+//     perf/notes/08-pdf-lib.md. fast-refs.mjs stays in the tree as
+//     an A/B baseline (mutex-checked in measure.mjs).
+//   fast-inflate     -- swaps pako.inflate for node:zlib.inflateSync
+//     on the one pdf-lib call site that uses it
+//     (PDFCrossRefStreamParser during load). Negligible cost shift,
+//     but eliminates the last pdf-lib -> pako call at runtime.
+//   fast-parse-number -- direct-integer accumulators in front of
+//     BaseParser.parseRawNumber + parseRawInt, skipping per-byte
+//     string concat and the trailing Number() round-trip. Touches
+//     every numeric token parsed during PDFDocument.load.
+//   fast-decode-name -- cache in front of PDFName.of that skips
+//     the decodeName regex scan when the input has no `#` (which
+//     is 99.999 % of the ~2.8 M PDFName.of calls per load).
+//   fast-number-to-string -- short-circuit numberToString when
+//     `String(num)` already lacks an `e` (i.e. for every PDF number
+//     that's not in the exponential-notation tail). Skips a
+//     redundant toString + split + parseInt per call.
+//   fast-size-in-bytes -- replace utils.sizeInBytes (which allocates
+//     `n.toString(2)` just to count its bit length) with a non-
+//     allocating short-circuit ladder. Called ~300 k times per save
+//     from PDFCrossRefStream's xref writer.
+//   fast-dict-onebuf -- one long-lived buffer for every committed
+//     PDFDict entry across the whole document. Parser uses a small
+//     per-instance temp array as a stack of recursion frames; each
+//     parseDict invocation appends to temp, commits its frame to
+//     main in one contiguous append, and pops temp back. PDFDicts
+//     only ever read from main, so a packed (start, length, owned)
+//     Number is the whole instance state -- no separate bufIdx.
+//     Owned dicts (factory-created post-parse) also append to main.
+//     Mutations: in-place replace for existing keys, COW (copy
+//     range to tail, push new pair) for new keys or delete.
+//     PDFContext is a singleton -- one PDFDocument.load per
+//     process; a second distinct context throws. Subsumes
+//     fast-dict-array. Process-phase heap traffic drops from the
+//     Map-backed baseline of ~152 MB down to ~66 MB (-57%); -22%
+//     beyond fast-dict-array. See "One-buffer PDFDict" in
+//     perf/notes/08-pdf-lib.md.
+//
+//     Earlier dict-shape shims (fast-dict-array, fast-dict-iter,
+//     fast-parse-dict) stay in the tree as A/B baselines but are
+//     mutually exclusive with --fast-dict-onebuf in measure.mjs.
+//   fast-parse-object -- replace PDFObjectParser.prototype.parseObject
+//     with a first-byte-dispatch version that gates the three
+//     matchKeyword (true / false / null) scans behind a byte check.
+//     parseObject fires per dict value / array element / indirect
+//     object body; the upstream version pays three speculative
+//     matchKeyword fail-and-rewind costs on every invocation. Same
+//     semantics, dispatch reordered by observed frequency.
+//   fast-parse-name -- byte-keyed cache in front of
+//     PDFObjectParser.parseName. Upstream builds the name body via
+//     `name += charFromCode(byte)` per byte then hands the result
+//     to PDFName.of (fast-decode-name's string-keyed Map). 99.7 % of
+//     the 1.68 M calls per load on the book are cache hits -- the
+//     same ~5 k unique names show up over and over (Type, Length,
+//     Pages, MediaBox, ...) -- so the per-call string build + hash
+//     is pure overhead on the hot path. The shim scans bytes with
+//     direct buffer access, accumulates a small Smi hash, and
+//     looks up a `Map<hash, PDFName>` keyed by byte content. On
+//     hit (~99.7 %) it returns the PDFName with zero string
+//     allocation; on miss it builds the string in one shot via
+//     String.fromCharCode and routes through the upstream
+//     PDFName.of (which is fast-decode-name's cache on this stack)
+//     so both caches converge on the same PDFName instance. ~80 ms
+//     of process wall-clock saved (-9 %) on the book, mostly on
+//     load (0.41 s -> 0.33 s). +1.3 MB long-lived heap for the
+//     cache itself, a small price for the load-time reduction.
+//   fast-sync-load -- rip the parseSpeed / objectsPerTick /
+//     shouldWaitForTick / waitForTick machinery out of both pdf-lib's
+//     load path (PDFDocument.load + five PDFParser /
+//     PDFObjectStreamParser methods underneath it) and its save path
+//     (PDFWriter.serializeToBuffer + computeBufferSize, plus the
+//     unreachable PDFStreamWriter.computeBufferSize patched for
+//     consistency). Each upstream method is wrapped in __awaiter so
+//     on browsers it can yield to the event loop every objectsPerTick
+//     objects; in Node the gate never fires but every indirect object
+//     still paid for the generator state machine + Promise
+//     allocation. ~135 ms of attributed parser self-time + ~40 ms
+//     writer + an unknowable chunk of the GC row removed; the
+//     parseSpeed / objectsPerTick options drop off all our call sites
+//     in step with this shim.
+//   fast-indirect-objects -- replace PDFContext.indirectObjects
+//     (Map<PDFRef, PDFObject>) with a dense array indexed by
+//     objectNumber for the gen=0 path. After fast-dict-array shipped,
+//     PDFContext.assign's `this.indirectObjects.set(ref, object)` was
+//     the only hot Map.set left in the heap profile (~7 MB of set
+//     traffic from the parser's once-per-indirect-object assign).
+//     Mirror of the fast-refs trick on the value side: dense array
+//     for gen=0, Map fallback for gen!=0. enumerateIndirectObjects
+//     skips its sort when the gen!=0 Map is empty (the common case).
+//     Drops PDFContext.assign out of the CPU top-15 and halves the
+//     remaining set heap traffic.
+//   fast-pdfnumber-pool -- value-keyed cache in front of PDFNumber.of.
+//     Dense array for non-negative integers in [0, 16384), Map
+//     fallback for floats / negatives / out-of-range. PDFs reuse the
+//     same numeric values (page indices, /Count, /N, /MediaBox
+//     dimensions) hundreds of thousands of times against only a few
+//     thousand unique values; pooling collapses parseNumberOrRef's
+//     ~15 MB of PDFNumber allocations to ~0.8 MB. Total process-phase
+//     heap traffic drops ~13 % (123 MB -> 107 MB). PDFNumber is
+//     immutable so sharing is safe.
+//   measure-pass (Phase 1) -- no-allocate byte walker
+//     (docs/lib/measure-pass.mjs) that runs in front of
+//     PDFDocument.load on the raw Chrome PDF and counts dictSlots
+//     + arraySlots. The counts drive setExpectedDictSlots() on
+//     fast-dict-onebuf and setExpectedArraySlots() on
+//     fast-array-onebuf, pre-sizing each shim's backing Array to
+//     the exact measured demand (no V8 growth resizes during load).
+//     Net wall-clock is ~+40 ms on the book (walker costs ~60 ms;
+//     load saves ~20). The bound on mainBuf isn't material on its
+//     own (~60 K slots out of 2.4 M) but commits the two-pass
+//     shape. Phase 2/3/3β (Float64Array mainBuf + encoded slots)
+//     were explored and didn't ship -- per-slot encode/decode cost
+//     exceeded the mark-phase savings. See "Phase 1: pre-size
+//     mainBuf via measure-pass" in perf/notes/08-pdf-lib.md.
+//   fast-array-onebuf -- same range-view pattern as fast-dict-onebuf
+//     applied to PDFArray. Each PDFArray's per-instance
+//     `this.array = []` goes away; instances become views into a
+//     shared arrayMain (plain JS Array, heterogeneous slots holding
+//     the original PDFObject references). Reads are direct -- no
+//     decode, unlike the explored-but-didn't-ship encoded approach
+//     which encoded slots into a Float64Array and paid ~300 ms of
+//     decodeValue dispatch during save. ~19 MB of process-phase
+//     heap traffic from parseArray collapses (the `this.array`
+//     allocation + grow doublings across ~79 k PDFArrays). See
+//     "One-buffer PDFArray" in perf/notes/08-pdf-lib.md.
+import './lib/fast-refs-class.mjs';
+import './lib/fast-inflate.mjs';
+import './lib/fast-parse-number.mjs';
+import './lib/fast-decode-name.mjs';
+import './lib/fast-number-to-string.mjs';
+import './lib/fast-size-in-bytes.mjs';
+import { setExpectedDictSlots }     from './lib/fast-dict-onebuf.mjs';
+import { setExpectedArraySlots }    from './lib/fast-array-onebuf.mjs';
+import './lib/fast-parse-object.mjs';
+import './lib/fast-parse-name.mjs';
+import './lib/fast-sync-load.mjs';
+import './lib/fast-indirect-objects.mjs';
+import './lib/fast-pdfnumber-pool.mjs';
+import { measure as measureRawPdf } from './lib/measure-pass.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
+import { parallelSave }             from './lib/parallel-deflate.mjs';
 
 const __dirname = dirname(fileURLToPath(import.meta.url));
 
@@ -245,15 +407,33 @@ try {
   console.log(`generate: ${fmtMs(Date.now() - tGenerate)}  (raw ${(rawPdf.length / 1024 / 1024).toFixed(1)} MB)`);
 
   // Process -- pdf-lib roundtrip with outline + metadata attached.
-  // parseSpeed: Fastest and objectsPerTick: Infinity are critical:
-  // pdf-lib's defaults yield to the event loop between every 100/50
-  // objects, turning a ~5 s round-trip into ~40 s on a 50 MB PDF
-  // (~35 s of which is pure V8 idle). See perf/README.md.
+  // fast-sync-load strips the waitForTick yield gates on both load
+  // and save sides entirely (load was ~40 s under pdf-lib's Slow
+  // default that yields every 100 objects; ~5 s on Fastest; now
+  // ~1 s with the gates ripped out -- so parseSpeed / objectsPerTick
+  // no longer matter and drop from the call sites).
+  //
+  // parallelSave (vs the default pdfDoc.save):
+  //  - objectsPerStream: 500 -- larger object-stream chunks compress
+  //    better (shared deflate window), 5 % smaller output PDF, and
+  //    cuts the per-chunk dispatch overhead 10x.
+  //  - dispatches every chunk's deflate to libuv's thread pool via
+  //    async zlib.deflate instead of running serially on the main
+  //    thread. Moves ~300 ms of zlib work off-CPU on the book.
+  //
+  // measureRawPdf walks rawPdf once with no allocations and hands
+  // the exact dictSlot + arraySlot counts to fast-dict-onebuf /
+  // fast-array-onebuf so each shim's backing Array is pre-sized;
+  // eliminates V8 growth resizes during load.
+  // See perf/notes/08-pdf-lib.md.
   const tProcess = Date.now();
-  const pdfDoc = await PDFDocument.load(rawPdf, { parseSpeed: ParseSpeeds.Fastest });
+  const counts = measureRawPdf(rawPdf);
+  setExpectedDictSlots(counts.dictSlots);
+  setExpectedArraySlots(counts.arraySlots);
+  const pdfDoc = await PDFDocument.load(rawPdf);
   setMetadata(pdfDoc, meta);
   await setOutline(pdfDoc, outline, false);
-  const finalPdf = await pdfDoc.save({ objectsPerTick: Infinity });
+  const { bytes: finalPdf } = await parallelSave(pdfDoc, { objectsPerStream: 500 });
   console.log(`process:  ${fmtMs(Date.now() - tProcess)}`);
 
   writeFileSync(outputPath, Buffer.from(finalPdf));
diff --git a/package-lock.json b/package-lock.json
index 4d32d0a..3e17b77 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -9,8 +9,8 @@
       "version": "0.0.0",
       "devDependencies": {
         "html-entities": "^2.6.0",
-        "pdf-lib": "^1.17.1",
-        "puppeteer": "^25.0.4"
+        "pdf-lib": "1.17.1",
+        "puppeteer": "25.0.4"
       }
     },
     "node_modules/@babel/code-frame": {
diff --git a/package.json b/package.json
index ba6093a..3dce871 100644
--- a/package.json
+++ b/package.json
@@ -5,7 +5,7 @@
   "description": "PDF book pipeline and profiling harness for the twinBASIC documentation",
   "devDependencies": {
     "html-entities": "^2.6.0",
-    "pdf-lib": "^1.17.1",
-    "puppeteer": "^25.0.4"
+    "pdf-lib": "1.17.1",
+    "puppeteer": "25.0.4"
   }
 }
diff --git a/perf/.gitignore b/perf/.gitignore
index df01c96..001fa9e 100644
--- a/perf/.gitignore
+++ b/perf/.gitignore
@@ -1,3 +1,4 @@
 results/
 ab-css/
 ab-css-*/
+raw.pdf
diff --git a/perf/README.md b/perf/README.md
index 81e67b6..f3637cc 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -61,6 +61,361 @@ Drop `--render-only` whenever you need to also measure generate /
 process (e.g. confirming a fix doesn't shift cost into `page.pdf()`
 or pdf-lib), or to write `book.pdf` for behavioural verification.
 
+## Profiling pdf-lib (process phase): canonical command
+
+The mirror command for CPU-profiling the pdf-lib roundtrip (run from
+`perf/`):
+
+```
+node measure.mjs --fast-refs-class --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-parse-name --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100 --out results/<label>
+```
+
+`--out results/<label>` is optional but recommended: omit it and the
+run lands in `results/<ISO-timestamp>/`, which is fine for one-off
+captures but awkward to refer to later. For A/B work, label both
+sides (`results/pre-foo`, `results/post-foo`).
+
+Then read the bottom-up table:
+
+```
+node analyze-profile.mjs results/<label>/process.cpuprofile --top 15
+```
+
+Flag rationale:
+
+- `--fast-refs` -- inject the
+  [docs/lib/fast-refs.mjs](../docs/lib/fast-refs.mjs) shim:
+  dense-array cache for `PDFRef.of`'s gen=0 path; on miss,
+  constructs the `PDFRef` directly via
+  `Object.create(PDFRef.prototype)` + manual field init, bypassing
+  the upstream `pool.set(tag, instance)` and dropping the
+  per-instance `tag` string (`toString` / `sizeInBytes` /
+  `copyBytesInto` compute from `objectNumber` /
+  `generationNumber` directly). After `--fast-indirect-objects`
+  shipped, the upstream pool was the last hot `Map.set` in the
+  heap profile; this drops the `PDFRef.of` row off the CPU top-15
+  and the `set` builtin row from ~7.5 MB to ~0.5 MB. The
+  tag-drop layer then collapses
+  `parseIndirectObjectHeader` 13.7 MB → 9.3 MB and total process
+  heap 51.9 MB → 45.2 MB (-13 %). **A/B baseline only** since
+  `--fast-refs-class` shipped: the `Object.create + writes`
+  construction style routes V8 through the slow-property path,
+  ending up at ~60 B/instance vs the constructor version's ~44 B.
+  Mutex with `--fast-refs-class` in the harness.
+- `--fast-refs-class` -- inject the
+  [docs/lib/fast-refs-class.mjs](../docs/lib/fast-refs-class.mjs)
+  shipping fix. Same dense-array cache + tag-drop as
+  `--fast-refs`, but PDFRef instances are built via plain-function
+  constructors rather than `Object.create + property writes`. Two
+  shapes: `_FastRef(objectNumber)` for the gen=0 path (one inline
+  slot) and `_FastRefGen(objectNumber, generationNumber)` for the
+  rare gen!=0 path (two slots, only the xref free entry at
+  object 0 on fresh-Chrome workloads). `generationNumber = 0` is a
+  data-property default on `PDFRef.prototype` so reads on gen=0
+  instances return 0 without an accessor dispatch -- keeps every
+  upstream `.objectNumber` / `.generationNumber` IC monomorphic on
+  the data-property path. V8 gives `new`-built instances a stable
+  hidden class from the first instance; per-instance is 16 B
+  aligned (one slot) for gen=0 vs 24 B for the legacy two-slot
+  shape. Saves ~3.87 MB heap and ~140 ms wall-clock from the
+  constructor-shape change plus another ~1.88 MB from the
+  single-slot variant on top.
+  `_FastRef.prototype = PDFRef.prototype` keeps `instanceof PDFRef`
+  satisfied and resolves method dispatch on the shared prototype
+  (no extra proto-chain hop). gen != 0 has its own `poolGenN` Map
+  keyed by `"N M"` -- the shim is the entire `PDFRef.of` factory
+  now, no upstream pool involved. Production runs through it.
+- `--parallel-deflate` -- swap `pdfDoc.save()` for `parallelSave`
+  from [docs/lib/parallel-deflate.mjs](../docs/lib/parallel-deflate.mjs),
+  which pre-deflates object streams in parallel on libuv's pool with
+  `objectsPerStream: 500`. Production runs through it; same logic.
+  Moves ~300 ms of zlib work off the main thread, and routes every
+  deflate call through `node:zlib` (no pdf-lib pure-JS fallback).
+  Phase 2's buffer-build + deflate is pipelined: each stream's
+  `deflateAsync(os.getUnencodedContents())` fires on libuv as soon
+  as its buffer is built, overlapping with the build of the next
+  stream instead of running build × 453 then deflate × 453 as
+  serial passes. Saves another ~47 ms on save (-10 %); the
+  `(idle)` row at the `Promise.all` gate (was 21 ms / 2.8 %)
+  drops out of the CPU top-15.
+- `--fast-decode-name` -- inject
+  [docs/lib/fast-decode-name.mjs](../docs/lib/fast-decode-name.mjs), a
+  parallel `Map<string, PDFName>` in front of `PDFName.of` that
+  skips the `decodeName` regex scan when the raw name has no `#`
+  hex escape (99.999 % of the ~2.8 M `PDFName.of` calls per load).
+  Production runs through it; ~530 ms saved on process.
+- `--fast-number-to-string` -- inject
+  [docs/lib/fast-number-to-string.mjs](../docs/lib/fast-number-to-string.mjs),
+  short-circuiting pdf-lib's `numberToString` when `String(num)`
+  already lacks an `e` (i.e. for every PDF number that isn't in
+  the exponential-notation tail -- 100 % of ~290 k calls on the
+  book). Skips a redundant `toString` + `split` + `parseInt` per
+  call. Production runs through it. Profile self-time on the
+  function drops from ~45-50 ms (~2 % of process) to ~5-12 ms.
+- `--fast-size-in-bytes` -- inject
+  [docs/lib/fast-size-in-bytes.mjs](../docs/lib/fast-size-in-bytes.mjs),
+  replacing pdf-lib's `utils.sizeInBytes` (which allocates
+  `n.toString(2)` just to count its bit length) with a non-
+  allocating short-circuit ladder. Called ~300 k times per save
+  from `PDFCrossRefStream`'s xref writer; the dominant inputs
+  are 1-2 byte values so a `n < 0x100 ? 1 : ...` ladder catches
+  most calls in one compare. Production runs through it. ~60 ms
+  saved on process.
+- `--fast-inflate` -- inject
+  [docs/lib/fast-inflate.mjs](../docs/lib/fast-inflate.mjs), swapping
+  `pako.inflate` for `node:zlib.inflateSync` on the one path
+  pdf-lib uses it (the compressed xref stream during load).
+  Negligible wall-clock; flag exists so paired A/Bs against pure
+  upstream pdf-lib can keep the rest of the perf set on while
+  isolating this swap. Production runs through it.
+- `--fast-parse-number` -- inject
+  [docs/lib/fast-parse-number.mjs](../docs/lib/fast-parse-number.mjs),
+  replacing `BaseParser.parseRawNumber` / `parseRawInt` with
+  direct-integer accumulators (`n = n*10 + (byte - 0x30)`) that
+  skip per-byte string concat and the trailing `Number()` round-
+  trip. Every numeric token parsed during `PDFDocument.load`
+  flows through these -- hundreds of thousands of calls per load
+  on the book. Production runs through it.
+- `--fast-dict-array` -- inject
+  [docs/lib/fast-dict-array.mjs](../docs/lib/fast-dict-array.mjs),
+  replacing `PDFDict`'s backing `Map` with a flat alternating
+  `[k0, v0, k1, v1, ...]` array allocated per-dict (pre-sized to 10
+  slots, the median). Was production before `--fast-dict-onebuf`
+  superseded it; kept as an A/B baseline. See "Replace PDFDict's
+  backing Map with a flat array" in
+  [notes/08-pdf-lib.md](notes/08-pdf-lib.md).
+- `--fast-dict-onebuf` -- inject
+  [docs/lib/fast-dict-onebuf.mjs](../docs/lib/fast-dict-onebuf.mjs).
+  One long-lived buffer for every committed PDFDict entry across
+  the whole document. Parser uses a small per-parser temp array as
+  a stack of recursion frames; each parseDict invocation appends
+  to temp, commits its frame to main in one contiguous append,
+  and pops temp back. PDFDicts only ever read from main, so the
+  whole instance state packs into one 41-bit Number (23-bit start
+  + 1-bit `normalized` flag + 1-bit `autoNormalizeCTM` flag +
+  16-bit length, in that bit order). Owned dicts (factory-created
+  post-parse, COW results) also append to main. Mutations:
+  in-place replace for existing keys, COW (copy range to tail,
+  append new pair, update encoded range) for new keys or delete --
+  all preserve the two gap bits via `+ (d & GAP_MASK)` after the
+  repack. The wrapper instances themselves use the constructor-
+  based shape `fast-refs-class` introduced for PDFRef -- one
+  plain-function constructor per subclass (`_FastDict`,
+  `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the
+  prototype aliased to the upstream prototype, so V8 sees a stable
+  hidden class from the first instance. PDFPageLeaf collapses to
+  the same single-`d` shape as plain PDFDict, with `normalized` /
+  `autoNormalizeCTM` as prototype getters/setters that mask in/out
+  of bits 23-24. Saves ~20 B/PDFDict × 260 k = ~5.2 MB heap on
+  top of the storage refactor, plus ~26 KB on the 1 651 page
+  leaves from the flag-packing. Mutually exclusive with the other
+  dict-shape shims. ~77 % cumulative heap reduction since the
+  original Map-backed PDFDict (152 -> 35 MB). Production runs
+  through it. See
+  [notes/08-pdf-lib.md "One-buffer PDFDict"](notes/08-pdf-lib.md).
+- `--fast-array-onebuf` -- inject
+  [docs/lib/fast-array-onebuf.mjs](../docs/lib/fast-array-onebuf.mjs).
+  Same range-view pattern as `--fast-dict-onebuf` applied to
+  PDFArray: every committed element lives in a single append-only
+  `arrayMain` JS Array, each PDFArray is a view via packed
+  `(start, length)` in `d`. Backing is a plain heterogeneous JS
+  Array -- slots hold the original PDFObject references, reads are
+  `arrayMain[start + i]` with no decode. This is the explored-but-
+  didn't-ship Phase 3 encoded approach minus the Float64Array
+  encoding (which cost ~300 ms on save's `copyBytesInto` from
+  per-slot `decodeValue` dispatch). Per-parser `_arrayTemp` for
+  the recursion stack, independent of fast-dict-onebuf's
+  `_dictTemp`. Mutations: in-place replace for `set`, in-place
+  extend at HWM for `push`, COW for everything else. Singleton
+  context is duplicated (10 lines) rather than shared so each shim
+  stays independently injectable. Wrapper instances built via a
+  `_FastArray` plain-function constructor (prototype aliased to
+  `PDFArray.prototype`) rather than `Object.create + writes`, the
+  same shape change `fast-refs-class` and `fast-dict-onebuf` made
+  on their factory paths -- worth ~22 B/PDFArray × ~80 k instances
+  = ~1.7 MB heap, but the headline win is that with all three
+  shape changes in place V8 sees fully monomorphic call sites for
+  PDFRef / PDFDict / PDFArray construction and method dispatch,
+  collapsing GC self-time 101 → 59 ms (-42 %) and process
+  wall-clock 1.03 → 0.90 s (-130 ms, -13 %). ~19 MB process-phase
+  heap traffic drops -- collapses parseArray's `this.array = []`
+  + grow doublings across ~79 k PDFArrays. Composes with
+  `--fast-dict-onebuf`. Production runs through it. See
+  [notes/08-pdf-lib.md "One-buffer PDFArray"](notes/08-pdf-lib.md).
+- `--measure-pass` -- inject
+  [docs/lib/measure-pass.mjs](../docs/lib/measure-pass.mjs), the
+  no-allocate byte walker. Runs in front of `PDFDocument.load` on
+  rawPdf, counts dictSlots + arraySlots, hands them to
+  `setExpectedDictSlots()` on `fast-dict-onebuf` and (when on)
+  `setExpectedArraySlots()` on `fast-array-onebuf`, pre-sizing each
+  shim's backing Array to the exact slot count. Eliminates V8
+  growth resizes during load. Net wall-clock ~+40 ms on the book
+  (walker ~60 ms, load saves ~20). Production runs through it -- the
+  bound on mainBuf isn't material on its own (~60 K slots out of
+  2.4 M) but commits the two-pass shape; Phases 2/3/3β (Float64Array
+  mainBuf + encoded slots) were explored and didn't ship. Requires
+  `--fast-dict-onebuf` (mutex-checked). See "Phase 1: pre-size mainBuf
+  via measure-pass" in
+  [notes/08-pdf-lib.md](notes/08-pdf-lib.md).
+- `--fast-parse-object` -- inject
+  [docs/lib/fast-parse-object.mjs](../docs/lib/fast-parse-object.mjs),
+  replacing `PDFObjectParser.prototype.parseObject` with a
+  first-byte-dispatch version that gates the three speculative
+  `matchKeyword` calls (`true` / `false` / `null`) behind a byte
+  check. The upstream `parseObject` pays three `matchKeyword`
+  fail-and-rewind costs per dispatch (`bytes.offset()` +
+  `bytes.next()` + comparison + `bytes.moveTo(initialOffset)`)
+  before peeking the dispatch byte, on every call -- and the
+  three keywords are extraordinarily rare in real PDFs. The shim
+  peeks first and only enters `matchKeyword` when the byte could
+  plausibly start a keyword (`t` / `f` / `n`); dispatch order is
+  reshuffled by observed frequency in dict-value position (numbers
+  / refs first, then `<<`, names, arrays, strings). Same
+  semantics. Pulls `parseObject` self-time from ~82 ms (5.2 %)
+  to ~40 ms (3.1 %). Production runs through it.
+- `--fast-parse-name` -- inject
+  [docs/lib/fast-parse-name.mjs](../docs/lib/fast-parse-name.mjs),
+  a byte-keyed cache in front of
+  `PDFObjectParser.prototype.parseName`. Upstream builds the name
+  body via `name += charFromCode(byte)` per byte then hands the
+  result to `PDFName.of`'s string-keyed Map. On the book, 1.68 M
+  parseName calls hit ~5 k unique names (99.7 % cache-hit rate)
+  -- the per-call string build + hash is pure overhead on the hot
+  path. The shim scans bytes with direct buffer access,
+  accumulates a Java-style `hash * 31 + byte` Smi hash in the same
+  pass, and looks up a `Map<hash, Entry | Entry[]>` keyed by byte
+  content; on hit returns the PDFName with zero string allocation.
+  On miss, builds the string in one shot (`String.fromCharCode`
+  with direct args -- not `.apply` on a typed-array view, which is
+  a V8 deopt path) and routes through the upstream `PDFName.of`
+  (fast-decode-name's cache on this stack) so both caches converge
+  on the same PDFName instance. Pulls `parseName` + `fastOf`
+  combined from ~144 ms (~16 % of process) to ~58 ms; -80 ms
+  process wall-clock (-9 %), all on load (0.41 s → 0.33 s).
+  +1.3 MB long-lived heap for the cache itself. Production runs
+  through it.
+- `--fast-sync-load` -- inject
+  [docs/lib/fast-sync-load.mjs](../docs/lib/fast-sync-load.mjs),
+  replacing nine `__awaiter`-wrapped methods across pdf-lib's load
+  and save paths with awaiterless twins. Each upstream method is
+  wrapped in tslib `__awaiter` / `__generator` so on browsers it
+  can `await waitForTick()` every `objectsPerTick` objects; in
+  Node the yield gate never fires (objectsPerTick: Infinity), but
+  every indirect object still pays the generator state-machine
+  dispatch + Promise allocation. The shim removes the scaffolding
+  entirely. The `parseSpeed` / `objectsPerTick` options drop off
+  `PDFDocument.load`, `parallelSave`, and `pdfDoc.save` call sites
+  in step. Also short-circuits `skipJibberish` on the digit-byte
+  fast path -- `parseDocument`'s inner loop calls it ~150 k times
+  per load on the book, each call speculatively running
+  `matchKeyword(xref/trailer/startxref)` + `matchIndirectObjectHeader`
+  to confirm what the outer `while`'s `IsDigit` check already
+  proved; peeking the byte first and `continue`-ing on a digit
+  saves ~62 ms on load. Production runs through it.
+- `--fast-indirect-objects` -- inject
+  [docs/lib/fast-indirect-objects.mjs](../docs/lib/fast-indirect-objects.mjs),
+  replacing `PDFContext.indirectObjects` (`Map<PDFRef, PDFObject>`)
+  with a dense array indexed by `objectNumber` for the gen=0 path.
+  Mirror of `--fast-refs` on the value side. After `--fast-dict-array`
+  landed, `PDFContext.assign`'s
+  `this.indirectObjects.set(ref, object)` was the only hot
+  `Map.set` left in the heap profile (~7 MB of `set` traffic,
+  fired once per indirect object during load). Patches `assign` /
+  `lookup` / `lookupMaybe` / `delete` / `getObjectRef` /
+  `enumerateIndirectObjects` to consult the dense array first,
+  Map as gen!=0 fallback (rare on freshly-parsed PDFs). As a side
+  benefit `enumerateIndirectObjects` skips its sort when the
+  gen!=0 Map is empty -- dense-array iteration is already in
+  objectNumber order. Drops `PDFContext.assign` out of the CPU
+  top-15 and halves the remaining `set` heap traffic. Production
+  runs through it.
+- `--fast-pdfnumber-pool` -- inject
+  [docs/lib/fast-pdfnumber-pool.mjs](../docs/lib/fast-pdfnumber-pool.mjs),
+  a value-keyed cache in front of `PDFNumber.of`. Dense array for
+  non-negative integers in `[0, 16384)`, Map fallback for floats
+  / negatives / out-of-range. PDFs reuse the same numeric values
+  (page indices, `/Count`, `/N`, `/MediaBox` dimensions, font
+  sizes) tens-to-hundreds of thousands of times against only a
+  few thousand unique values. `PDFNumber` is immutable so sharing
+  is safe. Collapses `parseNumberOrRef`'s ~15 MB of self-size to
+  ~0.8 MB (just the unique values); drops total process-phase
+  heap traffic by ~13 % (123 MB -> 107 MB). Production runs
+  through it.
+- `--cpu-profile-process` -- attach Node's `inspector/promises`
+  Profiler around the process phase only (skips render and generate).
+  Writes `process.cpuprofile` into the run's `results/` folder (`--out`, else timestamped).
+  The render-phase `--cpu-profile` is CDP / Chromium; this one is
+  Node / Node's V8 -- different runtimes, same `.cpuprofile` JSON
+  shape, so `analyze-profile.mjs` / `find-callers.mjs` /
+  `find-callees.mjs` / `grep-profile.mjs` work against either.
+- `--cpu-sampling 100` -- 100 us sampling. The process phase is now
+  ~2.3 s; at 1 ms default sampling that's only ~2300 samples and the
+  bottom-up table runs noisy. 100 us is the right resolution for
+  this length.
+
+The command intentionally **does not** pass `--cpu-profile`. There's
+no rule against running both at once -- they attach to different V8s
+and don't interfere -- but the render profile dilutes the bottom-up
+view of "what's left in pdf-lib," and the trace files are large.
+Profile one phase at a time.
+
+Why no `--render-only`? `--cpu-profile-process` requires the process
+phase to run; the harness errors out if you combine them.
+
+To profile the unpatched baseline (pure pdf-lib upstream), drop every `--fast-*` flag and `--parallel-deflate`.
+Caveat for A/B work: profiler-on attribution overstates the cost
+of hot functions called millions of times (`PDFRef.of` in
+particular). For "did this wall-clock change," do a paired
+no-profile A/B as a sanity check.
+
+## Profiling pdf-lib heap allocation (process phase): canonical command
+
+The companion command for the **sampling heap profile** of the
+process phase -- "where is pdf-lib allocating bytes?" rather than
+"where is it spending cycles?" (run from `perf/`):
+
+```
+node measure.mjs --fast-refs-class --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-parse-name --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512 --out results/<label>
+```
+
+Same `--out` / labelling note as the CPU command above: omit it for a
+timestamped folder, pass it for a stable name.
+
+Then read the bottom-up table:
+
+```
+node analyze-heap-profile.mjs results/<label>/process.heapprofile --top 15
+```
+
+Same `--fast-*` set as the CPU command (production is the baseline
+we care about); the new flags:
+
+- `--heap-profile-process` -- attach Node's `inspector/promises`
+  `HeapProfiler` around the process phase only. Writes
+  `process.heapprofile` into the run's `results/` folder (`--out`, else timestamped).
+  Output is V8's sampling-heap-profile JSON (a tree of
+  `{ callFrame, selfSize, children }` rooted at `head`), not the
+  flat-nodes shape that `.cpuprofile` uses, so the cpu analyzers
+  don't apply. Use `analyze-heap-profile.mjs` instead, which walks
+  the tree and aggregates `selfSize` by `(functionName + url:line)`:
+  `node analyze-heap-profile.mjs results/<run>/process.heapprofile --top 10`.
+- `--heap-sampling 512` -- 512-byte sampling interval. V8's default
+  is 32768 (32 KB); on the ~150 MB process-phase allocation total
+  that's only ~5 k samples and the bottom-up table runs coarse.
+  512 B yields ~250 k samples on the book, plenty of resolution
+  for "which frame allocated this Map?". Caveat: 512 B sampling
+  inflates process wall-clock substantially (the sampler's
+  per-allocation bookkeeping fires 64x more often). Read the
+  attribution, not the timing, from heap-profiled runs.
+
+`--heap-profile-process` composes with `--cpu-profile-process` --
+both attach to the same inspector session, so you can capture cpu
+and heap in a single run if you want. The same `--render-only`
+incompatibility applies (no process phase to profile).
+
+See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase
+investigations these flags enabled.
+
 ## What's in this folder
 
 The harness and core probes:
@@ -89,6 +444,9 @@ or `--tracing`):
 | `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). Operates on the Blink trace events only -- ignores any embedded V8 cpu samples (`Profile` / `ProfileChunk`). |
 | `analyze-hybrid.mjs` | Bottom-up analyzer that *combines* the V8 cpu samples and the Blink trace events from a hybrid `trace.json`. Builds a `[JS root..leaf] ++ [Blink outer..inner]` stack at each sample (filtering V8's virtual frames and JS-entry wrapper events) and prints either top-N self-time mixing JS function names with Blink/V8 event names, or `--callees <label>` direct-callees for any name on either axis. Lets you walk a single causation chain from a JS function down through the Blink layout / style work it triggered via gBCR (`hasOverflow -> getBoundingClientRect -> Document::UpdateStyleAndLayout -> Blink.ForcedStyleAndLayout.UpdateTime -> ...`). |
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
+| `find-heap-callers.mjs` | Heap-profile companion to `find-callers.mjs`. Walks a `.heapprofile` tree and attributes a target allocator's (e.g. `set`, `Map`, `String`) self+descendant bytes back to each direct caller. Useful for "where do all these Map.set calls come from?" questions. |
+| `find-heap-callees.mjs` | Other direction: walks a `.heapprofile` tree and lists a target frame's direct children with their (self + subtree) byte totals. Used to crack open mystery rows like "fastParseDictArray has 58 MB of self-size -- what's it actually allocating?". |
+| `heap-subtree.mjs` | "What does this frame actually allocate?" -- prints the subtree under every frame whose name matches a substring, with each direct child's self + descendant total. Companion to `analyze-heap-profile.mjs` and `find-heap-callers.mjs`; use it when a top-15 row's self-size is big but its children look tiny (typical V8 inlining-attribution case). Built during the PDFRef class-shape round to confirm `maybeParseCrossRefSection` had inlined `PDFCrossRefSection.addEntry`'s object literals into its own compiled frame. |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
 | `grep-profile.mjs` | Lists every node in a `.cpuprofile` whose `functionName` matches a regex, with self-time and location. Quick check for "is this frame in the profile at all, and what's it called?" |
 | `ab-css.mjs` | CSS cost attribution for `docs/_site-pdf/assets/css/print.css` + `rouge.css`. Renders the book per variant (full / drop-rouge / drop-print-extras / baseline-minimal) and reports **paired-difference** CPU sample-time across N pairs (default 3), with the baseline re-measured immediately before each variant pair to cancel machine-state drift. Pulls per-`Document::recalcStyle` / `LocalFrameView::performLayout` / `rebuildLayoutTree` / `ShapeText` total time from the embedded V8 cpu profile in the hybrid trace; prints mean ± SD per variant so noise-floor rows are visible. Auto-pins on Windows via `pin-cpu.mjs`. Optional `--per-print-section` adds one drop-print-`<section>` variant per `/* ---- ---- */` divider in print.css; individual sections of print.css turned out to be below the noise floor on this book, so off by default. |
@@ -112,11 +470,15 @@ Side experiments / one-shot probes:
 | --- | --- |
 | `profile-load.mjs` | Standalone profiler for `PDFDocument.load`. Runs the load on a chosen PDF with a chosen `parseSpeed`; intended to be run under `node --cpu-prof`. Auto-pins on Windows via `pin-cpu.mjs`. |
 | `profile-roundtrip.mjs` | Times the full pdf-lib `load + save` roundtrip across the three `parseSpeed` / `objectsPerTick` settings on a chosen PDF. Auto-pins on Windows via `pin-cpu.mjs`. |
+| `instrument-pioh.mjs` | Wraps `PDFParser.prototype.parseIndirectObjectHeader` + `matchIndirectObjectHeader` with counters and reports per-load call counts + the kept-heap delta. Built during the "is the 9 MB heap row a real parser hot spot or a V8 inlining-attribution artifact" investigation -- a non-zero `mih` count would mean fast-sync-load's digit short-circuit isn't firing; a non-zero `throws` would mean the speculative-recovery try/catch is firing on production. Run with `node --expose-gc perf/instrument-pioh.mjs`. |
+| `instrument-objclasses.mjs` | Counts every PDF* class touched by a load on `raw.pdf`: per-class `.of()` call count for the pooled wrappers (PDFRef / PDFName / PDFNumber / PDFString / PDFHexString / PDFRawStream / PDFObjectStream) AND a post-load walk of `PDFContext.enumerateIndirectObjects()` bumping per-runtime-class counts for the top-level shapes. Used to size the constructor-shape round (how many of each wrapper is the per-instance cost multiplied by?). Run with `node perf/instrument-objclasses.mjs`. |
 | `probe-chrome-outline.mjs` | Renders a synthetic multi-level h1..h6 document via Chrome's `outline: true` and dumps the resulting `/Outlines` tree. Quick check that the CDP flag is wired correctly in the local Chromium / puppeteer combo. |
 | `compare-outlines.mjs` | Diffs two PDFs' `/Outlines` trees by `(depth, title, target page)`. Used to verify whether Chrome's native outline matches the injected one. |
 | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. |
 | `probe-parallel.mjs` | Two-shard `Promise.all` `page.pdf()` probe -- the cost-of-`pageRanges`-sharding measurement (see *`pageRanges` sharding: off the table for now* in [notes/06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md)). |
 | `probe-idle-browser.mjs` | Standalone probe: launches a headless browser and measures steady-state idle memory + sample-time, for separating render cost from browser-fixed overhead. |
+| `phase0-measure.mjs` | No-allocate byte walker over a raw PDF: recognises the grammar (indirect objects, dicts, arrays, names, numbers, refs, strings, streams, ObjStms) and produces counts only, without instantiating any PDFObject. Viability gate for the two-pass measure-allocate-work architecture that ships as `measure-pass.mjs`. Run with `node perf/phase0-measure.mjs <input.pdf> --runs N`; defaults to the most recent `perf/results/*/book.pdf`. Companion to `--dump-raw-pdf <path>` on `measure.mjs`, which captures the canonical 39 MB Chrome-output input once. |
+| `instrument-slot-types.mjs` | Walks `fast-dict-onebuf`'s `main` buffer after setOutline and classifies each slot by PDFObject subtype, printing key/value counts and percentages. Used to scope the Phase 2 / Phase 3 encoding work -- how many slot-marks would a Float64Array mainBuf actually eliminate, and what's the side-pool fallback rate. Invoked via `--instrument-slot-types` on `measure.mjs` (requires `--fast-dict-onebuf`; mutex with `--incremental` / `--render-only`). |
 
 Documentation:
 
@@ -172,7 +534,9 @@ run.bat path\to\some-other.html           # explicit input
 run.bat --out my-run                      # explicit output directory
 run.bat --no-detach-pages                 # opt out of the detach-pages fix (measure pre-fix O(n²) baseline)
 run.bat --timing                          # collect per-page wall time + heap (writes timing.csv + quartile summary)
-run.bat --cpu-profile                     # CPU-profile the render phase
+run.bat --cpu-profile                     # CPU-profile the render phase (CDP, Chromium-side)
+run.bat --cpu-profile-process             # CPU-profile the process phase (Node inspector, Node-side)
+run.bat --heap-profile-process            # sampling heap-profile the process phase (Node inspector HeapProfiler); pair with --heap-sampling 512 for fine attribution
 run.bat --render-only                     # bail out after render (skip generate + process, ~47s saved)
 run.bat --clone-count                     # report Layout.append clones appended vs survivors per page
 run.bat --instrument                      # count + time DOM-accessor calls
@@ -180,11 +544,33 @@ run.bat --time-hooks                      # per-task timing of every chunker/pol
 run.bat --incremental                     # process via incremental update instead of pdf-lib roundtrip
 run.bat --chrome-outline                  # let Chrome emit /Outlines (skip parseOutline + setOutline)
 run.bat --tracing                         # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples)
+run.bat --fast-refs                       # dense-array cache for PDFRef.of's gen=0 path + tag-drop (A/B baseline; production now runs --fast-refs-class)
+run.bat --fast-refs-class                 # --fast-refs + class-constructor PDFRef shape for stable V8 hidden class (also ships; opt-in here for A/B)
+run.bat --parallel-deflate                # parallelSave with objectsPerStream=500 (also ships; opt-in here for A/B)
+run.bat --fast-decode-name                # skip decodeName regex when name has no # (also ships; opt-in here for A/B)
+run.bat --fast-number-to-string           # skip numberToString redundant toString/split when no exponential (also ships; opt-in here for A/B)
+run.bat --fast-size-in-bytes              # non-allocating ladder for xref byte-width (also ships; opt-in here for A/B)
+run.bat --fast-inflate                    # swap pako.inflate for node:zlib.inflateSync (also ships; opt-in here for A/B)
+run.bat --fast-parse-number               # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B)
+run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (Map-shape baseline; production now runs --fast-dict-onebuf)
+run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (Map-shape baseline; production now runs --fast-dict-onebuf)
+run.bat --fast-dict-array                 # replace PDFDict's backing Map with a per-dict flat [k,v,k,v,...] array; subsumes --fast-dict-iter + --fast-parse-dict (A/B baseline; production now runs --fast-dict-onebuf)
+run.bat --fast-dict-onebuf                # ONE long-lived buffer for all PDFDict entries + small per-parser temp (also ships; opt-in here for A/B)
+run.bat --fast-array-onebuf               # ONE long-lived buffer for all PDFArray elements + small per-parser temp; composes with --fast-dict-onebuf (also ships; opt-in here for A/B)
+run.bat --measure-pass --fast-dict-onebuf # walk rawPdf with the no-allocate measure pass and pre-size --fast-dict-onebuf's mainBuf to the exact dict-slot count (Phase 1 of the two-pass architecture; mutex with --incremental and --render-only)
+run.bat --fast-indirect-objects           # dense-array cache for PDFContext.indirectObjects (gen=0 path); mirror of --fast-refs on the value side (also ships; opt-in here for A/B)
+run.bat --fast-pdfnumber-pool             # value-keyed cache in front of PDFNumber.of; dense array for small ints, Map for the rest (also ships; opt-in here for A/B)
+run.bat --fast-parse-object               # first-byte dispatch in parseObject; gate true/false/null matchKeyword behind byte check (also ships; opt-in here for A/B)
+run.bat --fast-parse-name                 # byte-keyed cache in front of parseName; skip the string build + Map<string, PDFName> hash on the 99.7 % cache-hit path (also ships; opt-in here for A/B)
+run.bat --fast-sync-load                  # synchronify PDFDocument.load + parser; strip waitForTick machinery (also ships; opt-in here for A/B)
 ```
 
 Flags compose. The CPU profile lands as `render.cpuprofile`
 (loadable in Chrome DevTools -> Performance -> "Load profile...");
-`--instrument` prints a per-op table at end-of-render.
+`--cpu-profile-process` writes `process.cpuprofile` alongside it;
+`--heap-profile-process` writes `process.heapprofile` (loadable in
+Chrome DevTools -> Memory -> "Load profile..."); `--instrument`
+prints a per-op table at end-of-render.
 
 You need `_site-pdf\book.html` to exist first -- run `docs\build.bat`
 (which is `bundle exec jekyll build`) if you haven't already.
@@ -285,6 +672,33 @@ file documenting each:
 | Disable WhiteSpaceFilter | [05](notes/05-blink-trace.md) | ~0.7 s render |
 | Full sync chain (RunMicrotasks → 0) | [06](notes/06-microtasks-pageranges-css.md) | re-attribution |
 | `--disable-gpu` + `--in-process-gpu` | [07](notes/07-memory.md) | ~200 MB memory |
+| `pako.deflate` → `node:zlib.deflateSync` | [08](notes/08-pdf-lib.md) | ~1.5 s process (save -58 %) |
+| `PDFRef.of` dense-array cache (gen=0) | [08](notes/08-pdf-lib.md) | ~0.2 s process (load -16 %) |
+| Parallel deflate + `objectsPerStream: 500` | [08](notes/08-pdf-lib.md) | ~0.3 s process (zlib off-thread; PDF -5 %) |
+| `PDFName.of` no-`#` cache (skip `decodeName` regex) | [08](notes/08-pdf-lib.md) | ~0.5 s process (load -17 %, GC -101 ms) |
+| `numberToString` no-`e` short-circuit | [08](notes/08-pdf-lib.md) | ~40 ms profile, below wall-clock noise |
+| `sizeInBytes` short-circuit ladder (no base-2 string) | [08](notes/08-pdf-lib.md) | ~60 ms process (save -70 ms) |
+| `PDFDict` iter (Map.forEach with hoisted callbacks) | [08](notes/08-pdf-lib.md) | ~80 ms process (dict path -6 pp) |
+| `parseDict` sentinel-PDFName hoist (Type/Catalog/Pages/Page) | [08](notes/08-pdf-lib.md) | ~17 ms profile (fastOf -22 %) |
+| Synchronify pdf-lib load + save (strip `__awaiter` scaffolding) | [08](notes/08-pdf-lib.md) | ~0.36 s process (load -26 %, GC -53 ms) |
+| `parseObject` first-byte dispatch + gated keyword scans | [08](notes/08-pdf-lib.md) | ~42 ms profile (parseObject -51 %) |
+| `PDFDict` flat-array storage (subsumes iter + parseDict shims) | [08](notes/08-pdf-lib.md) | ~48 ms process (Map+set heap -80 %, GC -20 %) |
+| `PDFContext.indirectObjects` dense gen=0 array | [08](notes/08-pdf-lib.md) | `assign` off CPU top-15; remaining `set` heap -48 % |
+| `PDFRef.of` direct-construct on cache miss (skip upstream `pool.set`) | [08](notes/08-pdf-lib.md) | `PDFRef.of` off CPU top-15 (~93 ms); `set` heap 7.7 MB → 0.5 MB |
+| `PDFNumber.of` value-pool (dense int + Map fallback) | [08](notes/08-pdf-lib.md) | `parseNumberOrRef` off heap top-10; total process heap 123 MB → 107 MB (-13 %) |
+| Pre-size `parseDict` accumulator (`new Array(10)` median) | [08](notes/08-pdf-lib.md) | `fastParseDictArray` heap row -25 %; total process heap 107 MB → 92 MB (-14 %) |
+| One-buffer `PDFDict` (single mainBuf + packed 53-bit instance) | [08](notes/08-pdf-lib.md) | total process heap 92 MB → 66 MB (-28 %); cumulative -57 % since Map-backed PDFDict |
+| `measure-pass` (Phase 1) wired into production via `setExpectedDictSlots()` | [08](notes/08-pdf-lib.md) | byte-identical output; mainBuf pre-sized to the exact dict-slot count (no V8 growth resizes); ~+40 ms net process |
+| One-buffer `PDFArray` (single arrayMain + packed (start, length) view) | [08](notes/08-pdf-lib.md) | total process heap 66 MB → 52 MB (-21 %); parseArray off top 15; cumulative -66 % since Map-backed PDFDict |
+| Drop per-instance `PDFRef.tag` string (`copyBytesInto` digit-write, `sizeInBytes` digit-count, `toString` on demand) | [08](notes/08-pdf-lib.md) | `parseIndirectObjectHeader` 13.7 MB → 9.3 MB; total process heap 51.9 MB → 45.2 MB (-13 %) |
+| `skipJibberish` digit-byte fast path (peek before speculative `matchKeyword` + `matchIndirectObjectHeader`) | [08](notes/08-pdf-lib.md) | load mean 0.518 → 0.455 s (-62 ms; -12 % of load, ~-6 % of process); save flat; byte-identical output |
+| Class-constructor `PDFRef` shape (`new _FastRef(...)` for stable V8 hidden class) | [08](notes/08-pdf-lib.md) | per-PDFRef ~60 B → ~44 B; total process heap 45.3 MB → 41.4 MB (-8.5 %); process wall 1.13 s → 0.99 s (-140 ms, -12 %) |
+| Class-constructor `PDFDict` shape (`_FastDict` / `_FastCatalog` / `_FastPageTree` / `_FastPageLeaf` per-subclass constructors) | [08](notes/08-pdf-lib.md) | `_makeFromRange (dict)` 16.5 MB → 11.4 MB; total process heap 41.4 MB → 35.4 MB (-14.4 %); cumulative -77 % since Map-backed PDFDict |
+| Class-constructor `PDFArray` shape (`_FastArray` factory + monomorphic call-site unlock across all three Fast classes) | [08](notes/08-pdf-lib.md) | total process heap 35.4 MB → 33.7 MB (-4.9 %); process wall 1.03 s → 0.90 s (-130 ms); GC self-time 101 ms → 59 ms (-42 %); cumulative -78 % heap since Map-backed PDFDict, -20 % process across the three shape-change commits |
+| Byte-keyed `parseName` cache (Map<hash, Entry &#124; Entry[]>; skip per-call string build + string-keyed Map hash on 99.7 % hit path) | [08](notes/08-pdf-lib.md) | `parseName` + `fastOf` combined 144 ms → 58 ms; process wall 0.90 s → 0.82 s (-80 ms, -9 %, all on load); +1.3 MB long-lived heap for the cache |
+| Pipeline `parallel-deflate` (overlap buffer-build with libuv deflate by folding two `.map`s into one) | [08](notes/08-pdf-lib.md) | save 0.467 s → 0.420 s (-47 ms, -10 %); `(idle)` row at `Promise.all` gate drops out of CPU top-15 |
+| Pack PDFPageLeaf flags into `d`'s gap bits (`_FastPageLeaf` collapses to single-`d` shape; bit layout shifts to start[0:22] / norm[23] / auto[24] / length[25:40]) | [08](notes/08-pdf-lib.md) | ~26 KB on 1 651 page leaves (sub-row at 512 B sampler); output byte-identical; CPU flat |
+| Two-shape `PDFRef` (gen=0 single-slot `_FastRef` + gen!=0 two-slot `_FastRefGen`; `generationNumber = 0` as prototype default keeps IC monomorphic at every caller) | [08](notes/08-pdf-lib.md) | per-instance 24 B → 16 B aligned; total process heap 34.96 MB → 33.08 MB (-1.88 MB) |
 
 What was tried and didn't ship:
 
@@ -298,7 +712,7 @@ What was tried and didn't ship:
 
 ## Investigation log
 
-The seven phase files in [`notes/`](notes/) cover the full investigation
+The phase files in [`notes/`](notes/) cover the full investigation
 narrative. Each is self-contained but they're written in chronological
 order; later ones reference earlier ones for context.
 
@@ -311,3 +725,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time; mirroring the same change to PDFArray's `_makeFromRange` and COW paths with a single `_FastArray` 
constructor (no subclass dispatch needed -- PDFArray has none in pdf-lib) drops ~22 B/PDFArray × ~80 k = ~1.7 MB heap, but the surprise win is on CPU + GC: with all three shape changes in place V8 sees fully monomorphic call sites for PDFRef / PDFDict / PDFArray construction and method dispatch, undoing the dict-only state's +18 ms GC regression and then some -- GC self-time 101 → 59 ms (-42 %), process wall-clock 1.03 → 0.90 s (-130 ms, -13 %), so cumulative across the three shape-change commits (fast-refs-class + fast-dict-onebuf class + fast-array-onebuf class) the process drops 1.13 → 0.90 s (-230 ms, -20 %), total heap 45.3 → 33.7 MB (-25.6 %), GC self-time 87 → 59 ms (-32 %), with output byte-identical modulo timestamps; with the constructor-shape round closed, the new #1 row in the process CPU profile was `PDFObjectParser.prototype.parseName` at 87 ms self + 57 ms via its `fastOf` callee = 144 ms combined (~16 % of process) firing 1.68 M times per load, of which 4 787 are unique (99.7 % cache-hit rate -- the same handful of dict keys like Type, Length, Pages, MediaBox over and over) -- two failed first attempts (skip per-byte ByteStream method dispatch via direct buffer access while keeping the cons-string accumulator: V8's cons-string optimisation was already covering the cost so no movement; and `String.fromCharCode.apply(null, buf.subarray(...))` as a one-shot allocation: SLOWER at ~123 ms vs ~87 ms because `.apply` on a typed-array view is a V8 deopt path) pointed at the wrong surface, the real win was caching the answer keyed on the byte content, scanning bytes with direct buffer access while accumulating a Java-style `hash * 31 + byte | 0` Smi hash in the same pass, looking up `Map<hash, Entry | Entry[]>` keyed by byte content (single-entry buckets the common case at 4.8 k names into 2^32 hash space, collision-bucket scan via `instanceof Entry` check), with cold path building the string in one shot via `String.fromCharCode` direct args and routing 
through fast-decode-name so both caches converge on the same PDFName instance -- pulls `parseName` + `fastOf` combined from 144 ms to 58 ms (-60 %), -80 ms process wall-clock (-9 %), all on load (0.41 → 0.33 s); +1.3 MB long-lived heap (4.8 k Entry objects + Uint8Array byte-keys + Map<number, ...> overhead) is a fixed cost for a workload-bounded cache; the heap-profile run shows a much bigger drop (3.50 → 2.56 s, -940 ms) -- not a real wall-clock win, just the sampler's per-allocation bookkeeping dropping in step with the ~1.6 M transient string allocations we eliminated (read cpu numbers for "did we get faster", heap numbers for long-lived cost); the next row to drop was `PDFObjectStream.getUnencodedContents` (#4 at 46 ms self / 124 ms with callees) paired with a fat `(idle)` row at 32 ms / 3.4 % -- both attributable to `parallel-deflate.mjs`'s phase 2 running build + deflate as two strictly serial passes (`objectStreams.map(os => os.getUnencodedContents())` followed by `Promise.all(unencoded.map(buf => deflateAsync(buf)))`, the first ~120 ms of main-thread block then ~30 ms of main-thread idle awaiting libuv), so folding the two `.map`s into one (`Promise.all(objectStreams.map(os => deflateAsync(os.getUnencodedContents())))`) pipelines build with deflate -- each deflate fires on libuv as its buffer is built, overlapping with the build of the next stream rather than after all 453 builds complete -- and the await resolves almost immediately by the time the build loop finishes (by then ~430 of 453 deflates have run on the 4-worker pool, each ~0.3 ms compute); paired 3-run A/B with the rest of the shipped flag set on confirms save 0.467 s → 0.420 s (-47 ms, -10 %), process 0.887 s → 0.833 s (-54 ms, -6 %), load + setOutline flat as expected; the `(idle)` row drops out of the CPU top-15 entirely and `getUnencodedContents` self-time also drops (31.56 → 22.25 ms) as V8's task scheduling between build and the fire-and-forget Promise creation reattributes some samples -- 
a 47 ms vs 32 ms estimate gap accounted for by microtask-queue drain at the `Promise.all` gate + libuv callback marshalling now spread across the build loop instead of bunched at the end; the class-shape round left PDFPageLeaf as the only subclass with extra fields (`normalized` default false + `autoNormalizeCTM` default true, both written in the `_FastPageLeaf` constructor body) so the 1 651 page leaves on the book were ~24 B larger than plain `_FastDict` instances -- packing both booleans into `d`'s gap bits collapses PageLeaf to the same single-`d` shape (bit layout shifts from start[0:23] + length[24:37] to start[0:22] + norm[23] + auto[24] + length[25:40], dropping start from 24 to 23 bits / 8.4 M slots vs ~2.3 M mainLen, growing length from 14 to 16 bits / 65 535 vs 8 706 observed max) with the booleans as prototype getters/setters that mask in/out of bits 23-24, and the V8 Smi gotcha worth recording: Smi is 31-bit signed so d > 2^30 (i.e. length >= 32) boxes to HeapNumber where `d | NORM_BIT` would truncate to Int32 and lose the length, so all writes use arithmetic (`d + NORM_BIT` / `d - NORM_BIT` gated on the current bit state) and the COW / set / delete paths preserve the gap bits via `+ (d & GAP_MASK)` after the repack; saves ~26 KB on the 1 651 page leaves (sub-row at 512 B sampler resolution but real, calculated per-instance), output byte-identical, CPU flat (no PageLeaf mutation paths fire on the render-only workflow); the same "shape change interior to construction, IC story at every caller" pattern that drove the PageLeaf collapse also yields a second-pass win on PDFRef -- single-shape `_FastRef` still allocated two inline slots for `objectNumber` + `generationNumber` but `generationNumber` is always zero on fresh-Chrome workloads except for the xref "free" entry at object 0, so splitting into `_FastRef(objectNumber)` (one slot, gen=0 path) + `_FastRefGen(objectNumber, generationNumber)` (two slots, rare gen!=0 path) with 
`PDFRef.prototype.generationNumber = 0` as a data-property default supplies the missing field via prototype lookup -- crucial that this is a data-property default not an accessor, because a first-attempt packed-`d` + getter variant regressed +1.6 MB heap / +70 ms CPU by breaking V8's monomorphic ICs at every caller of `ref.objectNumber` / `ref.generationNumber` (PDFCrossRefSection.append, PDFCrossRefStream entry tuples, PDFWriter.serializeToBuffer, fast-indirect-objects, the `{ref, offset, deleted}` literals in `addEntry`), couldn't elide the literals as aggressively under accessor dispatch, recompilation paths landed with worse code than the two-slot baseline; the two-shape data-property variant pays in a bounded place (one extra hidden class for the rare path) without touching any caller's IC, saving 8 B per gen=0 instance × 226 k unique = 1.88 MB heap on the book (34.96 MB → 33.08 MB total sampled), with output byte-identical and the gen!=0 Map (`poolGenN` keyed by `"N M"`) replacing the upstream PDFRef.of fallback entirely. |
diff --git a/perf/find-heap-callees.mjs b/perf/find-heap-callees.mjs
new file mode 100644
index 0000000..abdb70a
--- /dev/null
+++ b/perf/find-heap-callees.mjs
@@ -0,0 +1,70 @@
+// Show what a target frame allocates under itself.
+//
+// Reads a V8 .heapprofile (tree of { callFrame, selfSize, children }
+// rooted at `head`) and, for every node whose callFrame.functionName
+// matches the given target, lists its direct child frames with their
+// (self + subtree) byte totals. Answers "what does function X
+// actually allocate?".
+//
+// Companion to find-heap-callers.mjs. Where find-heap-callers walks
+// up (target's parents), this walks down (target's children).
+//
+// Usage:
+//   node perf/find-heap-callees.mjs <profile> <calleeName>
+//
+// Example:
+//   node perf/find-heap-callees.mjs results/<run>/process.heapprofile fastParseDictArray
+
+import { readFileSync } from 'node:fs';
+
+// CLI arguments: <profile> <calleeName>. Both are required; otherwise print usage and exit 2.
+const [, , profilePath, targetName] = process.argv;
+if (!(profilePath && targetName)) {
+  console.error('usage: node find-heap-callees.mjs <profile> <calleeName>');
+  process.exit(2);
+}
+const profile = JSON.parse(readFileSync(profilePath, { encoding: 'utf8' }));
+
+// Bytes allocated by `n` itself plus, recursively, by every descendant frame.
+function subtreeBytes(n) {
+  const own = n.selfSize || 0;
+  return (n.children || []).reduce((sum, kid) => sum + subtreeBytes(kid), own);
+}
+
+// Accumulators filled by walk(): per-child bytes keyed by "name @ url:line", plus target totals.
+const childTotals = new Map();
+const childSelfs = new Map();
+let targetSelf = 0;
+let targetSubtree = 0;
+// Adds `bytes` to the running sum stored under `key` in `map`.
+const addTo = (map, key, bytes) => map.set(key, (map.get(key) || 0) + bytes);
+// Depth-first visit of every frame; each frame named targetName contributes its
+// self / subtree bytes and one entry per direct child (nested matches included).
+function walk(n) {
+  const kids = n.children || [];
+  if ((n.callFrame?.functionName || '') === targetName) {
+    targetSelf += n.selfSize || 0;
+    targetSubtree += subtreeBytes(n);
+    kids.forEach((kid) => {
+      const { functionName, url, lineNumber } = kid.callFrame || {};
+      const where = url ? url.replace(/^file:\/\/\//, '') : '(no url)';
+      const key = `${functionName || '(anonymous)'} @ ${where}:${lineNumber != null ? lineNumber + 1 : '?'}`;
+      addTo(childTotals, key, subtreeBytes(kid));
+      addTo(childSelfs, key, kid.selfSize || 0);
+    });
+  }
+  kids.forEach((kid) => walk(kid));
+}
+walk(profile.head);
+
+// Summary line, then one row per direct child, largest subtree first; rows under 10 KB are omitted.
+console.log(`${targetName}: self=${(targetSelf / 1024).toFixed(2)} KB, subtree=${(targetSubtree / 1024 / 1024).toFixed(2)} MB`);
+console.log('direct children (subtree KB / self KB):');
+const toKb = (bytes) => bytes / 1024;
+const rows = Array.from(childTotals, ([k, subtree]) => ({ k, subtree, self: childSelfs.get(k) || 0 }));
+rows.sort((a, b) => b.subtree - a.subtree);
+rows.filter((r) => !(toKb(r.subtree) < 10)).forEach((r) => {
+  const subCol = toKb(r.subtree).toFixed(2).padStart(10);
+  const selfCol = toKb(r.self).toFixed(2).padStart(8);
+  console.log(`  ${subCol} KB  (self ${selfCol} KB)   ${r.k}`);
+});
diff --git a/perf/find-heap-callers.mjs b/perf/find-heap-callers.mjs
new file mode 100644
index 0000000..beaf062
--- /dev/null
+++ b/perf/find-heap-callers.mjs
@@ -0,0 +1,69 @@
+// Attribute a heap allocator's self+descendant bytes to each direct caller.
+//
+// Reads a V8 .heapprofile (tree of { callFrame, selfSize, children }
+// rooted at `head`) and, for every node whose callFrame.functionName
+// matches the given target, attributes its self+descendant selfSize back
+// to its immediate parent frame.
+//
+// Companion to find-callers.mjs (which does the same for .cpuprofile).
+// The tree shape means each occurrence has exactly one parent, so this
+// is straightforward depth-first attribution -- no need for the
+// parent-of map that find-callers.mjs builds.
+//
+// Usage:
+//   node perf/find-heap-callers.mjs <profile> <calleeName>
+//
+// Example:
+//   node perf/find-heap-callers.mjs results/<run>/process.heapprofile set
+//   node perf/find-heap-callers.mjs results/<run>/process.heapprofile Map
+//
+// `set` and `Map` show up as bare V8 builtins (no url, no line), so the
+// useful question is "who called them"; this script answers it.
+
+import { readFileSync } from 'node:fs';
+
+// CLI arguments: <profile> <calleeName>. Both are required; otherwise print usage and exit 2.
+const [, , profilePath, targetName] = process.argv;
+if (!(profilePath && targetName)) {
+  console.error('usage: node find-heap-callers.mjs <profile> <calleeName>');
+  process.exit(2);
+}
+const profile = JSON.parse(readFileSync(profilePath, { encoding: 'utf8' }));
+
+// Sum of selfSize over `n` and all of its descendants (a missing selfSize counts as 0).
+function subtreeBytes(n) {
+  const kids = n.children || [];
+  return kids.reduce((acc, kid) => acc + subtreeBytes(kid), n.selfSize || 0);
+}
+
+const callerBytes = new Map();
+let targetSelf = 0;
+let targetTotal = 0;
+// Label for a frame: "name @ url:line" (lineNumber + 1, or '?' when it is absent).
+function callerKey({ functionName, url, lineNumber }) {
+  const where = url ? url.replace(/^file:\/\/\//, '') : '(no url)';
+  return `${functionName || '(anon)'} @ ${where}:${lineNumber != null ? lineNumber + 1 : '?'}`;
+}
+// Depth-first visit; each frame named targetName has its self + descendant bytes credited to its parent.
+function walk(n, parent) {
+  if ((n.callFrame?.functionName || '') === targetName) {
+    const total = subtreeBytes(n);
+    targetSelf += n.selfSize || 0;
+    targetTotal += total;
+    if (parent) {
+      const pkey = callerKey(parent.callFrame || {});
+      callerBytes.set(pkey, (callerBytes.get(pkey) || 0) + total);
+    }
+  }
+  for (const kid of n.children || []) walk(kid, n);
+}
+walk(profile.head, null);
+
+// Summary line, then callers by attributed bytes, largest first; callers under 1 KB are omitted.
+const totalKb = targetTotal / 1024;
+console.log(`${targetName}: self=${(targetSelf / 1024).toFixed(2)} KB, total=${totalKb.toFixed(2)} KB (${(totalKb / 1024).toFixed(2)} MB)`);
+console.log('callers (attributed total KB):');
+const rows = Array.from(callerBytes).sort(([, a], [, b]) => b - a);
+rows.map(([k, bytes]) => [k, bytes / 1024]).filter(([, kb]) => !(kb < 1)).forEach(([k, kb]) => {
+  console.log(`  ${kb.toFixed(2).padStart(10)} KB   ${k}`);
+});
diff --git a/perf/heap-subtree.mjs b/perf/heap-subtree.mjs
new file mode 100644
index 0000000..8aaa212
--- /dev/null
+++ b/perf/heap-subtree.mjs
@@ -0,0 +1,64 @@
+// "What does this frame actually allocate?" -- prints the heap-profile
+// subtree under any frame whose name matches a substring, with each
+// direct child's self + descendant byte total.
+//
+// Companion to analyze-heap.mjs (bottom-up flat list) and
+// find-heap-callers.mjs (who called this allocator). Use this when a
+// row in the top-15 looks suspicious -- e.g. a big self-size with
+// invisible children -- and you want to see what was inlined into the
+// frame's compiled code. Built during the PDFRef class-shape round,
+// where `maybeParseCrossRefSection` showed 3.4 MB self but its named
+// children totalled <40 KB; the subtree view confirmed V8 had
+// inlined `PDFCrossRefSection.addEntry` and attributed its object-
+// literal allocations to the parent frame.
+//
+// Usage:
+//   node heap-subtree.mjs <path/to/process.heapprofile> <function-name-substring>
+//
+// The substring matches case-sensitively on the V8 frame's
+// `functionName` field; all matches are reported, so a needle like
+// "parseDict" surfaces every frame containing that name.
+
+import { readFileSync } from 'node:fs';
+import { resolve } from 'node:path';
+
+// CLI arguments: <process.heapprofile> <function-substring>. Both required; otherwise usage + exit 2.
+const [profilePath, needle] = process.argv.slice(2);
+if (!(profilePath && needle)) {
+  console.error('usage: node heap-subtree.mjs <process.heapprofile> <function-substring>');
+  process.exit(2);
+}
+const profile = JSON.parse(readFileSync(resolve(profilePath), { encoding: 'utf8' }));
+
+// Pre-order search: appends to `out` each frame whose functionName contains `needle` (`depth` is unused).
+function findNodes(node, out, depth = 0) {
+  const frameName = node.callFrame?.functionName || '';
+  if (frameName.includes(needle)) out.push(node);
+  (node.children || []).forEach((kid) => findNodes(kid, out, depth + 1));
+}
+const matches = [];
+findNodes(profile.head, matches);
+console.log(`Found ${matches.length} matching frame(s)\n`);
+
+// Per match: header line, self size, then up to 12 direct children ordered by self + descendant bytes.
+for (const match of matches) {
+  const head = match.callFrame;
+  console.log(`=== ${head.functionName}  @  ${head.url}:${(head.lineNumber||0)+1} ===`);
+  console.log(`self: ${(match.selfSize/1024).toFixed(2)} KB`);
+  console.log(`children (sorted by total):`);
+  const summarize = (n) => {  // stores self + descendant bytes on the node itself as `_total`
+    n._total = (n.children || []).reduce((sum, kid) => sum + summarize(kid), n.selfSize);
+    return n._total;
+  };
+  const kids = match.children || [];
+  kids.forEach((kid) => summarize(kid));
+  const ranked = [...kids].sort((a, b) => b._total - a._total).slice(0, 12);
+  ranked.forEach((kid) => {
+    const frame = kid.callFrame || {};
+    const fn = frame.functionName || '(anonymous)';
+    const tail = (frame.url || '').split(/[\\/]/).slice(-2).join('/');
+    const kb = (bytes, width) => (bytes/1024).toFixed(2).padStart(width);
+    console.log(`  ${kb(kid._total, 10)} KB total | ${kb(kid.selfSize, 8)} KB self | ${fn}  @  ${tail}:${(frame.lineNumber||0)+1}`);
+  });
+  console.log('');
+}
diff --git a/perf/instrument-objclasses.mjs b/perf/instrument-objclasses.mjs
new file mode 100644
index 0000000..3c2860c
--- /dev/null
+++ b/perf/instrument-objclasses.mjs
@@ -0,0 +1,146 @@
+// Count instances of each PDF* class touched by a load on the book.
+//
+// Two views of "how many":
+//
+//   1. "Counted by .of()" -- every call to ClassName.of(...) regardless
+//      of whether the pool returned an existing instance. Tells you call
+//      frequency. Useful for spotting "PDFRef.of fires 1.4 M times per
+//      load" vs "only 226 k of those are unique" (the rest are pool
+//      hits).
+//   2. "Observed in indirectObjects after load" -- walks the loaded
+//      PDFContext.enumerateIndirectObjects() and bumps a counter per
+//      top-level object's runtime class. Inline (nested) PDFDict /
+//      PDFArray instances don't show up here; for those, use the
+//      heap-profile rows directly.
+//
+// Wired up to inform the class-constructor shape work in
+// fast-refs-class / fast-dict-onebuf / fast-array-onebuf. Output on
+// the book:
+//
+//   Counted by .of():
+//     PDFRef               1429034   (~226 k unique, rest pool hits)
+//     PDFNumber             284105   (~16 k unique)
+//     PDFName              1681225   (~4.8 k unique)
+//     PDFString               7375
+//     PDFRawStream            2061
+//
+//   Observed in indirectObjects after load:
+//     PDFCatalog                  1
+//     PDFPageTree               238
+//     PDFPageLeaf              1651
+//     PDFRawStream            2061
+//     PDFDict                220815   (top-level only; ~261 k incl. nested)
+//     PDFArray                1651   (top-level only; ~80 k incl. nested)
+//
+// To get unique counts on the pooled classes, see the throwaway snippet
+// in the "Class-constructor shapes" section of README.md (wraps PDFRef.of
+// / PDFName.of / PDFNumber.of with a Set-based dedupe).
+//
+// Run: node perf/instrument-objclasses.mjs
+
+import '../docs/lib/fast-refs-class.mjs';
+import '../docs/lib/fast-inflate.mjs';
+import '../docs/lib/fast-parse-number.mjs';
+import '../docs/lib/fast-decode-name.mjs';
+import '../docs/lib/fast-number-to-string.mjs';
+import '../docs/lib/fast-size-in-bytes.mjs';
+import '../docs/lib/fast-parse-object.mjs';
+import '../docs/lib/fast-sync-load.mjs';
+import '../docs/lib/fast-indirect-objects.mjs';
+import '../docs/lib/fast-pdfnumber-pool.mjs';
+import { setExpectedDictSlots } from '../docs/lib/fast-dict-onebuf.mjs';
+import { setExpectedArraySlots } from '../docs/lib/fast-array-onebuf.mjs';
+import { measure as measureRawPdf } from '../docs/lib/measure-pass.mjs';
+import { PDFDocument } from 'pdf-lib';
+import { createRequire } from 'node:module';
+import { readFileSync } from 'node:fs';
+
+const require = createRequire(import.meta.url);
+
+const PDFRef        = require('pdf-lib/cjs/core/objects/PDFRef.js').default;
+const PDFName       = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFNumber     = require('pdf-lib/cjs/core/objects/PDFNumber.js').default;
+const PDFString     = require('pdf-lib/cjs/core/objects/PDFString.js').default;
+const PDFHexString  = require('pdf-lib/cjs/core/objects/PDFHexString.js').default;
+const PDFDict       = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const PDFArray      = require('pdf-lib/cjs/core/objects/PDFArray.js').default;
+const PDFStream     = require('pdf-lib/cjs/core/objects/PDFStream.js').default;
+const PDFRawStream  = require('pdf-lib/cjs/core/objects/PDFRawStream.js').default;
+const PDFBool       = require('pdf-lib/cjs/core/objects/PDFBool.js').default;
+const PDFCatalog    = require('pdf-lib/cjs/core/structures/PDFCatalog.js').default;
+const PDFPageTree   = require('pdf-lib/cjs/core/structures/PDFPageTree.js').default;
+const PDFPageLeaf   = require('pdf-lib/cjs/core/structures/PDFPageLeaf.js').default;
+const PDFObjectStream    = require('pdf-lib/cjs/core/structures/PDFObjectStream.js').default;
+const PDFCrossRefStream  = require('pdf-lib/cjs/core/structures/PDFCrossRefStream.js').default;
+const PDFFlateStream     = require('pdf-lib/cjs/core/structures/PDFFlateStream.js').default;
+const PDFContentStream   = require('pdf-lib/cjs/core/structures/PDFContentStream.js').default;
+
+// name -> number of ClassName.of(...) calls that returned normally.
+const counts = new Map();
+// Registers `name` with a zero count; if the class exposes a static `of`,
+// replaces it with a wrapper that delegates and then bumps the count.
+function track(name, Cls) {
+  counts.set(name, 0);
+  const origOf = Cls.of;
+  if (typeof origOf !== 'function') return;
+  Cls.of = function (...args) {
+    const made = origOf.apply(this, args);
+    counts.set(name, counts.get(name) + 1);
+    return made;
+  };
+}
+
+// Counting via .of for the pooled / factory-method classes. PDFDict
+// / PDFArray / PDFPageLeaf are constructed via the fast-dict-onebuf
+// and fast-array-onebuf factory paths; for those, the post-load walk
+// below scans PDFContext.enumerateIndirectObjects() instead.
+const tracked = {
+  PDFRef, PDFNumber, PDFName, PDFString,
+  PDFHexString, PDFRawStream, PDFObjectStream,
+};
+for (const [name, Cls] of Object.entries(tracked)) track(name, Cls);
+
+// Reads raw.pdf from this script's directory; the measure pass's slot counts go to the onebuf shims first.
+const rawPdf = readFileSync(new URL('./raw.pdf', import.meta.url));
+const { dictSlots, arraySlots } = measureRawPdf(rawPdf);
+setExpectedDictSlots(dictSlots);
+setExpectedArraySlots(arraySlots);
+
+const tBefore = Date.now();
+const doc = await PDFDocument.load(rawPdf);
+console.log('load:    ', Date.now() - tBefore, 'ms');
+
+// After-load count: classify each top-level indirect object by its runtime class.
+const seen = new Map();
+function bump(name) { seen.set(name, (seen.get(name) || 0) + 1); }
+// Tested in order and the first match wins, so a subclass must come before its
+// superclass. PDFContentStream extends PDFFlateStream in pdf-lib, so it is listed
+// first; placed after PDFFlateStream its counter could never be bumped.
+const classOrder = [
+  ['PDFCatalog', PDFCatalog], ['PDFPageTree', PDFPageTree], ['PDFPageLeaf', PDFPageLeaf],
+  ['PDFObjectStream', PDFObjectStream], ['PDFCrossRefStream', PDFCrossRefStream],
+  ['PDFContentStream', PDFContentStream], ['PDFFlateStream', PDFFlateStream],
+  ['PDFRawStream', PDFRawStream], ['PDFStream', PDFStream],
+  ['PDFDict', PDFDict], ['PDFArray', PDFArray],
+  ['PDFName', PDFName], ['PDFNumber', PDFNumber],
+  ['PDFString', PDFString], ['PDFHexString', PDFHexString], ['PDFBool', PDFBool],
+];
+// Bumps the counter of the first listed class `obj` is an instance of; null /
+// undefined and unlisted classes are ignored. `depth` is accepted but unused.
+function walk(obj, depth = 0) {
+  if (obj == null) return;
+  const hit = classOrder.find(([, Cls]) => obj instanceof Cls);
+  if (hit) bump(hit[0]);
+}
+
+for (const [, obj] of doc.context.enumerateIndirectObjects()) walk(obj);
+
+// Both tables print one "  <name padded to 20> <count>" row per class.
+console.log('\nCounted by .of():');
+counts.forEach((v, k) => console.log(`  ${k.padEnd(20)}`, v));
+console.log('\nObserved in indirectObjects after load:');
+const names = ['PDFCatalog','PDFPageTree','PDFPageLeaf','PDFObjectStream',
+  'PDFCrossRefStream','PDFFlateStream','PDFContentStream','PDFRawStream',
+  'PDFStream','PDFDict','PDFArray','PDFName','PDFNumber','PDFString',
+  'PDFHexString','PDFBool'];
+names.forEach((n) => console.log(`  ${n.padEnd(20)}`, seen.get(n) || 0));
diff --git a/perf/instrument-parsedict.mjs b/perf/instrument-parsedict.mjs
new file mode 100644
index 0000000..356f2f7
--- /dev/null
+++ b/perf/instrument-parsedict.mjs
@@ -0,0 +1,67 @@
+// Per-call counters for fastParseDictArray. Wraps the shim's
+// parseDict to count invocations, entries-per-dict distribution,
+// and recursion depth. Prints a histogram on process exit.
+//
+// Used to crack open fastParseDictArray's 58 MB self-row in the
+// process-phase heap profile -- without counts, we can't tell
+// whether "58 MB" is 10k dicts at 6 KB each or 300k dicts at
+// 200 bytes each.
+//
+// Idempotent. Composes with --fast-dict-array (must be loaded
+// AFTER fast-dict-array so it wraps the patched parseDict).
+
+import { createRequire } from 'node:module';
+const require = createRequire(import.meta.url);
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+
+if (!PDFObjectParser.prototype.__instrumentParseDictInstalled) {
+  const originalParseDict = PDFObjectParser.prototype.parseDict;
+  let totalCalls = 0;
+  let totalEntries = 0;
+  let maxSize = 0;
+  let depth = 0;
+  let maxDepth = 0;
+  const sizeHistogram = new Array(33).fill(0);  // [0..31] then 32+
+  // Wrapper: tracks nesting depth around the original call, then records the
+  // returned dict's entry count. A call that throws is not counted.
+  PDFObjectParser.prototype.parseDict = function () {
+    depth++;
+    if (depth > maxDepth) maxDepth = depth;
+    let result;
+    try {
+      result = originalParseDict.call(this);
+    } finally {
+      depth--;
+    }
+    totalCalls++;
+    // result.dict is the flat array [k0, v0, k1, v1, ...] (fast-dict-array)
+    // or a Map (upstream / fast-parse-dict). Handle both.
+    const inner = result.dict;
+    const entryCount = Array.isArray(inner) ? (inner.length >> 1) : inner.size;
+    totalEntries += entryCount;
+    if (entryCount > maxSize) maxSize = entryCount;
+    const bucket = entryCount < 32 ? entryCount : 32;
+    sizeHistogram[bucket]++;
+    return result;
+  };
+  process.on('exit', () => {
+    const avgEntries = totalCalls > 0 ? totalEntries / totalCalls : 0;  // 0/0 would print "NaN" when nothing was parsed
+    console.error('');
+    console.error('=== parseDict instrumentation ===');
+    console.error(`total calls       : ${totalCalls}`);
+    console.error(`total entries     : ${totalEntries}`);
+    console.error(`avg entries/dict  : ${avgEntries.toFixed(2)}`);
+    console.error(`max entries/dict  : ${maxSize}`);
+    console.error(`max recursion     : ${maxDepth}`);
+    console.error('entries-per-dict histogram:');
+    for (let i = 0; i <= 32; i++) {
+      const n = sizeHistogram[i];
+      if (n === 0) continue;  // also guarantees totalCalls > 0 for the division below
+      const label = i === 32 ? '32+' : String(i);
+      const bar = '#'.repeat(Math.min(60, Math.round(n / totalCalls * 200)));
+      console.error(`  ${label.padStart(4)} : ${String(n).padStart(7)}  ${bar}`);
+    }
+  });
+  PDFObjectParser.prototype.__instrumentParseDictInstalled = true;
+  console.log('[harness] instrument-parsedict: counting parseDict calls + size distribution');
+}
diff --git a/perf/instrument-pioh.mjs b/perf/instrument-pioh.mjs
new file mode 100644
index 0000000..30cc6e7
--- /dev/null
+++ b/perf/instrument-pioh.mjs
@@ -0,0 +1,98 @@
+// Count PDFParser.parseIndirectObjectHeader + matchIndirectObjectHeader
+// calls per load of perf/raw.pdf, plus the kept-heap delta across load.
+//
+// Background: the heap profile attributed ~9 MB of self-allocations to
+// parseIndirectObjectHeader -- enough to look like a real parser hot
+// spot. This script answers the prerequisite questions before
+// committing to an inline rewrite: how often is the function actually
+// called, does the speculative matchIndirectObjectHeader path fire on
+// the production shim stack (fast-sync-load's digit fast-path is
+// supposed to short-circuit it), and does parseIndirectObjectHeader
+// ever throw (recovery via matchIndirectObjectHeader's try/catch
+// wrapper)?
+//
+// Output, on the book (raw.pdf) with the current production shim stack:
+//   pioh calls:        226418
+//   pioh throws:       0
+//   mih  calls:        0           <- fast-sync-load short-circuit works
+//   heap delta (kept): ~35 MB
+//
+// The ~9 MB heap attribution turned out to be a V8 inlining-attribution
+// artifact (fastOf's PDFRef-construction bytes inlined into
+// parseIndirectObjectHeader's frame), not anything the function itself
+// allocates. Confirmed by re-profiling under `node --no-turbo-inlining`,
+// see "Class-constructor shapes for PDFRef / PDFDict / PDFArray" in
+// README.md. The fix wasn't in this function; it was in fast-refs's
+// wrapper construction (-> fast-refs-class).
+//
+// Run: node --expose-gc perf/instrument-pioh.mjs
+
+import '../docs/lib/fast-refs-class.mjs';
+import '../docs/lib/fast-inflate.mjs';
+import '../docs/lib/fast-parse-number.mjs';
+import '../docs/lib/fast-decode-name.mjs';
+import '../docs/lib/fast-number-to-string.mjs';
+import '../docs/lib/fast-size-in-bytes.mjs';
+import '../docs/lib/fast-parse-object.mjs';
+import '../docs/lib/fast-sync-load.mjs';
+import '../docs/lib/fast-indirect-objects.mjs';
+import '../docs/lib/fast-pdfnumber-pool.mjs';
+import { setExpectedDictSlots } from '../docs/lib/fast-dict-onebuf.mjs';
+import { setExpectedArraySlots } from '../docs/lib/fast-array-onebuf.mjs';
+import { measure as measureRawPdf } from '../docs/lib/measure-pass.mjs';
+import { PDFDocument } from 'pdf-lib';
+import { createRequire } from 'node:module';
+import { readFileSync } from 'node:fs';
+
+const require = createRequire(import.meta.url);
+const PDFParser = require('pdf-lib/cjs/core/parser/PDFParser.js').default;
+
+const rawPdf = readFileSync(new URL('./raw.pdf', import.meta.url));
+
+// Counting wrappers around parseIndirectObjectHeader and
+// matchIndirectObjectHeader. pioThrows counts calls of the former that
+// threw (the error is re-thrown unchanged). Per the file header, a non-zero
+// throw or mih count would mean the speculative matchIndirectObjectHeader
+// path is firing on the production shim stack, contradicting the
+// "fast-sync-load short-circuit works" result.
+let pioCalls = 0;
+let mihCalls = 0;
+let pioThrows = 0;
+const { parseIndirectObjectHeader: origPioh, matchIndirectObjectHeader: origMih } = PDFParser.prototype;
+
+PDFParser.prototype.parseIndirectObjectHeader = function () {
+  pioCalls += 1;
+  try {
+    return origPioh.call(this);
+  } catch (err) {
+    pioThrows += 1;
+    throw err;
+  }
+};
+
+PDFParser.prototype.matchIndirectObjectHeader = function () {
+  mihCalls += 1;
+  return origMih.call(this);
+};
+
+// Measure pass: its dict / array slot counts are handed to the onebuf shims
+// before the load. No dry-run load happens here; the load below is the only one.
+const counts = measureRawPdf(rawPdf);
+setExpectedDictSlots(counts.dictSlots);
+setExpectedArraySlots(counts.arraySlots);
+// Heap baseline; a collection is forced only when global.gc is defined.
+const collect = () => { if (global.gc) global.gc(); };
+collect();
+const heapBefore = process.memoryUsage().heapUsed;
+
+const tBefore = Date.now();
+const doc = await PDFDocument.load(rawPdf);
+const tAfter = Date.now();
+
+collect();
+const heapAfter = process.memoryUsage().heapUsed;
+console.log('load time:        ', tAfter - tBefore, 'ms');
+console.log('pioh calls:       ', pioCalls);
+console.log('pioh throws:      ', pioThrows);
+console.log('mih  calls:       ', mihCalls);
+console.log('heap delta (kept):', ((heapAfter - heapBefore) / 1024 / 1024).toFixed(2), 'MB');
diff --git a/perf/instrument-slot-types.mjs b/perf/instrument-slot-types.mjs
new file mode 100644
index 0000000..5b33b5d
--- /dev/null
+++ b/perf/instrument-slot-types.mjs
@@ -0,0 +1,102 @@
+// Slot-type instrumentation for fast-dict-onebuf's `main` buffer.
+//
+// Walks main[0..mainLen) after the process phase is "done writing"
+// (i.e. after PDFDocument.load + setOutline; save reads but doesn't
+// write) and classifies each slot by its PDFObject subtype. main's
+// invariant is even-position = key, odd-position = value (each
+// committed frame is even-length, and mainLen always advances by an
+// even amount). So the histogram is broken into key-side and
+// value-side -- keys should be 100 % PDFName; values are the mixed
+// distribution Phase 2's encoding has to handle.
+//
+// Measurement-only. Imported when --instrument-slot-types is passed
+// to perf/measure.mjs (requires --fast-dict-onebuf since main lives
+// in that shim).
+
+import { createRequire } from 'node:module';
+import { main, getMainLen } from '../docs/lib/fast-dict-onebuf.mjs';
+
+const require = createRequire(import.meta.url);
+const PDFName      = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFRef       = require('pdf-lib/cjs/core/objects/PDFRef.js').default;
+const PDFNumber    = require('pdf-lib/cjs/core/objects/PDFNumber.js').default;
+const PDFDict      = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const PDFArray     = require('pdf-lib/cjs/core/objects/PDFArray.js').default;
+const PDFString    = require('pdf-lib/cjs/core/objects/PDFString.js').default;
+const PDFHexString = require('pdf-lib/cjs/core/objects/PDFHexString.js').default;
+const PDFBool      = require('pdf-lib/cjs/core/objects/PDFBool.js').default;
+const PDFNull      = require('pdf-lib/cjs/core/objects/PDFNull.js').default;
+const PDFRawStream = require('pdf-lib/cjs/core/objects/PDFRawStream.js').default;
+const PDFInvalid   = require('pdf-lib/cjs/core/objects/PDFInvalidObject.js').default;
+
+// Classify a single slot. Returns a string tag.
+// undefined / null and the PDFNull / PDFBool singletons are matched by identity,
+// then classes by instanceof in the listed order (first match wins), so
+// PDFRawStream is tested before PDFDict and subclasses of PDFDict are tagged
+// 'PDFDict'. Raw numbers / strings and anything else fall through to the end.
+const SINGLETON_TAGS = [
+  [PDFNull, 'PDFNull'], [PDFBool.True, 'PDFBool.True'], [PDFBool.False, 'PDFBool.False'],
+];
+const CLASS_TAGS = [
+  [PDFRef, 'PDFRef'], [PDFName, 'PDFName'], [PDFNumber, 'PDFNumber'],
+  [PDFRawStream, 'PDFRawStream'], [PDFInvalid, 'PDFInvalidObject'],
+  [PDFDict, 'PDFDict'], [PDFArray, 'PDFArray'],
+  [PDFHexString, 'PDFHexString'], [PDFString, 'PDFString'],
+];
+function classify(v) {
+  if (v === undefined) return 'undefined';
+  if (v === null) return 'null';
+  for (const [inst, tag] of SINGLETON_TAGS) if (v === inst) return tag;
+  for (const [Cls, tag] of CLASS_TAGS) if (v instanceof Cls) return tag;
+  if (typeof v === 'number') return 'number(raw)';
+  if (typeof v === 'string') return 'string(raw)';
+  // Unrecognised: report the constructor name when there is one, else typeof.
+  return `OTHER(${(v && v.constructor && v.constructor.name) || typeof v})`;
+}
+
+// Walk main, classify each slot. Returns {keys, values, total, keyTotal, valueTotal}.
+// Even indices are tallied into `keys`, odd indices into `values`.
+export function classifySlots() {
+  const mainLen = getMainLen();
+  // Index 0 = key side, index 1 = value side; selected by the slot index's low bit.
+  const tallies = [Object.create(null), Object.create(null)];
+  const totals = [0, 0];
+
+  for (let i = 0; i < mainLen; i++) {
+    const side = i & 1;
+    const tag = classify(main[i]);
+    tallies[side][tag] = (tallies[side][tag] || 0) + 1;
+    totals[side] += 1;
+  }
+
+  const [keys, values] = tallies;
+  const [keyTotal, valueTotal] = totals;
+  return { keys, values, total: mainLen, keyTotal, valueTotal };
+}
+
+// Pretty-print, sorted by combined count descending.
+export function printHistogram(counts, label = '') {
+  const heading = label ? `[${label}] ` : '';
+  console.log(`${heading}slot classification: total=${counts.total}  keys=${counts.keyTotal}  values=${counts.valueTotal}`);
+  console.log('');
+  // One row per tag seen on either side, with its key-side, value-side and combined counts.
+  const tags = new Set([...Object.keys(counts.keys), ...Object.keys(counts.values)]);
+  const rows = Array.from(tags, (type) => {
+    const keys = counts.keys[type] || 0;
+    const values = counts.values[type] || 0;
+    return { type, keys, values, total: keys + values };
+  });
+  rows.sort((a, b) => b.total - a.total);
+
+  console.log('  type               keys       key%       values     value%     total      total%');
+  console.log('  -----------------------------------------------------------------------------------');
+  // Percentage with two decimals; a zero denominator yields '0.00'.
+  const pct = (n, d) => d ? (100 * n / d).toFixed(2) : '0.00';
+  const cell = (n) => n.toString().padStart(8);
+  for (const { type, keys, values, total } of rows) {
+    const keyPct = pct(keys, counts.keyTotal).padStart(7);
+    const valuePct = pct(values, counts.valueTotal).padStart(7);
+    const totalPct = pct(total, counts.total).padStart(6);
+    console.log(`  ${type.padEnd(18)} ${cell(keys)}  ${keyPct}%  ${cell(values)}  ${valuePct}%  ${cell(total)}  ${totalPct}%`);
+  }
+}
diff --git a/perf/measure.mjs b/perf/measure.mjs
index c3ec049..cb6e510 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -21,12 +21,21 @@
 //
 // Usage:
 //   node measure.mjs [path/to/book.html] [--out <dir>] [--keep-open]
-//                    [--cpu-profile] [--cpu-sampling <microseconds>]
-//                    [--heap-profile] [--heap-sampling <bytes>]
+//                    [--cpu-profile] [--cpu-profile-process]
+//                    [--cpu-sampling <microseconds>]
+//                    [--heap-profile] [--heap-profile-process]
+//                    [--heap-sampling <bytes>]
 //                    [--tracing]
 //                    [--no-detach-pages] [--instrument] [--time-hooks]
 //                    [--incremental] [--chrome-outline] [--timing]
 //                    [--clone-count] [--render-only]
+//                    [--fast-refs] [--parallel-deflate]
+//                    [--fast-decode-name] [--fast-number-to-string]
+//                    [--fast-size-in-bytes] [--fast-inflate]
+//                    [--fast-parse-number] [--fast-parse-dict]
+//                    [--fast-parse-object] [--fast-sync-load]
+//                    [--fast-dict-array] [--fast-indirect-objects]
+//                    [--fast-pdfnumber-pool]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -82,12 +91,160 @@
 // chrome://tracing or perfetto.dev, or run analyze-trace.mjs against it
 // for a top-N self-time table grouped by event name. Composable with
 // --cpu-profile; uses an independent CDP domain.
+//
+// --cpu-profile-process wraps the process phase only (pdf-lib roundtrip
+// or incremental writer) in a V8 Profiler trace via Node's inspector
+// module -- the process phase runs in Node, not Chromium, so CDP's
+// Profiler can't see it. Writes process.cpuprofile alongside render's.
+// Honours --cpu-sampling. Composable with --cpu-profile when you want
+// both phases captured in one run.
+//
+// --heap-profile-process wraps the process phase in V8's sampling heap
+// profiler (Inspector's HeapProfiler domain) and writes
+// process.heapprofile alongside the cpu one. --heap-sampling sets the
+// sampling interval in bytes; default 32768 (V8's default). Drop to
+// 512 for finer-grained attribution on short phases. Composable with
+// --cpu-profile-process; both share one inspector session.
+//
+// --fast-refs replaces PDFRef.of's string-keyed Map lookup with a
+// dense-array cache for the gen=0 case (82 % of ~1.2 M calls on the
+// book). Eliminates the per-call `<obj> <gen> R` string allocation
+// and Map hash. gen != 0 calls (pdf-lib's xref-stream bookkeeping
+// for compressed objects) pass through unchanged.
+//
+// --parallel-deflate replaces pdfDoc.save() with parallelSave from
+// docs/lib/parallel-deflate.mjs: object streams are pre-deflated in
+// parallel on libuv's thread pool with objectsPerStream=500 (vs
+// pdf-lib's serial save with default 50). Moves ~300 ms of zlib work
+// off the main thread on the book.
+//
+// --fast-decode-name installs a parallel cache in front of PDFName.of
+// that skips the decodeName regex scan when the raw name contains
+// no `#` hex escape (which is 99.999 % of the ~2.8 M PDFName.of
+// calls per load on the book). ~150 ms saved on process load.
+//
+// --fast-number-to-string short-circuits pdf-lib's numberToString
+// when String(num) already lacks an `e`. Skips a redundant toString,
+// split, and parseInt per call; only the rare exponential-notation
+// tail still falls through to the original implementation.
+//
+// --fast-size-in-bytes replaces pdf-lib's utils.sizeInBytes -- which
+// allocates `n.toString(2)` just to count its bit length -- with a
+// non-allocating short-circuit ladder. Called ~300 k times per save
+// from PDFCrossRefStream's xref writer; the dominant inputs are
+// 1-2 byte values (type, gen, index, small obj-stream refs) so a
+// `n < 0x100 ? 1 : ...` ladder is the right shape.
+//
+// --fast-inflate swaps pako.inflate for node:zlib.inflateSync on
+// pdf-lib's one remaining pako call site (PDFCrossRefStreamParser
+// inflating the compressed cross-reference stream during
+// PDFDocument.load). One call per load, negligible wall-clock; flag
+// exists so paired A/Bs can compare against pure-pdf-lib behaviour.
+// Production runs through it.
+//
+// --fast-parse-number replaces pdf-lib's BaseParser.parseRawNumber
+// and parseRawInt with direct-integer accumulators (n = n*10 +
+// (byte - 0x30)) that skip per-byte string concatenation and the
+// trailing Number() round-trip. Every numeric token in a parsed
+// PDF flows through these; hundreds of thousands of calls per load
+// on the book. Production runs through it.
+//
+// --fast-parse-dict hoists the four sentinel PDFName.of calls
+// (Type / Catalog / Pages / Page) out of the type-dispatch tail
+// in PDFObjectParser.prototype.parseDict. The dispatch fires
+// per-dict (tens of thousands on the book) and even with
+// --fast-decode-name each lookup is still a Map.get on fastCache.
+// Pool-dedup makes the canonical PDFNames reference-stable, so
+// captured constants replace the four calls verbatim.
+//
+// --fast-parse-object replaces PDFObjectParser.prototype.parseObject
+// with a first-byte-dispatch version that gates the three
+// matchKeyword (true / false / null) scans behind a byte check.
+// parseObject fires per dict value / array element / indirect
+// object body (hundreds of thousands of calls on the book); the
+// upstream version pays three speculative matchKeyword fail-and-
+// rewind costs on every invocation. Same semantics, dispatch
+// reordered by observed frequency in dict-value position.
+//
+// --fast-dict-array replaces PDFDict's backing Map with a flat
+// alternating array [k0, v0, k1, v1, ...]. The sampling heap profile
+// showed `new Map()` + `Map.prototype.set` accounting for half the
+// process-phase allocations (~63 MB combined), 80 % of that traffic
+// from the parser's per-dict accumulator. The flat array is one
+// allocation per dict, no hash-table arena; lookups are linear scans
+// but PDF dicts are tiny (typically <= 10 entries). Subsumes
+// --fast-parse-dict and --fast-dict-iter (the parser's hot loop
+// accumulates into the array directly; sizeInBytes / copyBytesInto
+// iterate in place). Now superseded by --fast-dict-onebuf; kept as
+// an A/B baseline.
+//
+// --fast-dict-onebuf collapses the per-dict array allocation into
+// ONE long-lived mainBuf shared across every committed PDFDict
+// entry. A small per-parser temp array acts as a stack of parseDict
+// recursion frames so outer's range stays contiguous when inner
+// recurses. PDFDict instance state packs into one Number (24-bit start
+// + 14-bit length + 1-bit owned = 39 bits, inside the 53-bit safe-integer
+// range), no per-dict array header. Owned dicts (factory-created post-parse) append to main
+// and mutate in place / COW to the tail. PDFContext is a singleton
+// in our pipeline (one PDFDocument.load per process); a second
+// distinct context throws. Mutually exclusive with --fast-dict-array
+// and the other dict-shape shims. ~57 % cumulative heap reduction
+// since the original Map-backed PDFDict (152 -> 66 MB). Production
+// runs through it.
+//
+// --fast-indirect-objects replaces PDFContext.indirectObjects
+// (Map<PDFRef, PDFObject>) with a dense array indexed by
+// objectNumber for the gen=0 path -- mirror of the fast-refs trick
+// on the value side. After fast-dict-array shipped, that Map was
+// the last remaining hot Map.set in the heap profile (~14 MB of set
+// traffic from PDFContext.assign, fired once per indirect object
+// during load). gen!=0 PDFRefs fall through to the original Map.
+// enumerateIndirectObjects skips its sort when the gen!=0 Map is
+// empty (the parsed-PDF common case). Production runs through it.
+//
+// --fast-pdfnumber-pool installs a value-keyed cache in front of
+// PDFNumber.of. Dense array for non-negative integers in
+// [0, 16384), Map fallback for floats / negatives / out-of-range.
+// PDFs reuse the same numeric values (page indices, /Count, /N,
+// /MediaBox dimensions) tens-to-hundreds of thousands of times;
+// pooling collapses parseNumberOrRef's ~15 MB of PDFNumber
+// allocations to a few thousand cached instances. PDFNumber is
+// immutable so sharing is safe. Production runs through it.
+//
+// --measure-pass runs the no-allocate measure pass from
+// docs/lib/measure-pass.mjs against the raw Chrome PDF before
+// pdf-lib's load, and uses the measured dict-slot count to
+// pre-size fast-dict-onebuf's mainBuf to exact demand (no
+// V8-amortized growth, no slack). Phase 1 of the two-pass
+// measure-allocate-work architecture -- the win is purely the
+// plumbing landing byte-identical; Phase 2 (Float64Array mainBuf)
+// is where the GC mark cost actually drops. Requires --fast-dict-onebuf
+// (the consumer of setExpectedDictSlots); when --fast-array-onebuf is also on, the measured
+// array-slot count goes to its setExpectedArraySlots. Adds ~135 ms to the process phase on the book.
+//
+// --fast-sync-load rips pdf-lib's parseSpeed / objectsPerTick /
+// shouldWaitForTick / waitForTick machinery out of both the load
+// path (PDFDocument.load + PDFParser.parseDocument / parseDocumentSection
+// / parseIndirectObjects / parseIndirectObject +
+// PDFObjectStreamParser.parseIntoContext) and the save path
+// (PDFWriter.serializeToBuffer / computeBufferSize +
+// PDFStreamWriter.computeBufferSize). pdf-lib's TS downlevel wraps
+// each in tslib __awaiter / __generator so on browsers they can
+// `await waitForTick()` every `objectsPerTick` objects; with
+// objectsPerTick: Infinity (or the load path's parseSpeed: Fastest)
+// the gate never fires, but every indirect object still pays the
+// generator state-machine + Promise allocation. The shim removes
+// the scaffolding and the waitForTick yields entirely. Production
+// runs through it; the parseSpeed / objectsPerTick options are
+// dropped from PDFDocument.load / parallelSave / pdfDoc.save call
+// sites in step with this shim.
 
 import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve, join } from 'node:path';
 import { mkdirSync, writeFileSync, existsSync } from 'node:fs';
+import { Session } from 'node:inspector/promises';
 import puppeteer from 'puppeteer';
-import { PDFDocument, ParseSpeeds } from 'pdf-lib';
+import { PDFDocument } from 'pdf-lib';
 // Shared with docs/render-book.mjs -- the helpers and the paged.js
 // bundle live under docs/lib/ now that we've dropped the pagedjs-cli
 // dependency. Importing from there guarantees the harness measures the
@@ -96,6 +253,7 @@ import { parseOutline, setOutline } from '../docs/lib/outline.mjs';
 import { setMetadata }              from '../docs/lib/postprocesser.mjs';
 import { applyOutlineAndMetadataIncremental } from './incremental-pdf.mjs';
 import { pinCpuIfWindows } from './pin-cpu.mjs';
+import { parallelSave } from '../docs/lib/parallel-deflate.mjs';
 
 // On Windows, re-launch under `start /affinity 0x5500 /high` to stabilise
 // CPU sample-time. See pin-cpu.mjs. Cuts run-to-run variance from
@@ -109,9 +267,11 @@ let inputArg = null;
 let outArg = null;
 let keepOpen = false;
 let cpuProfile = false;
+let cpuProfileProcess = false;
 let cpuSampling = 1000; // microseconds
 let heapProfile = false;
-let heapSampling = 32768; // bytes between samples (CDP default)
+let heapProfileProcess = false;
+let heapSampling = 32768; // bytes between samples (V8 default; used by both CDP render-side and inspector process-side)
 let detachPages = true;
 let instrument = false;
 let timeHooks = false;
@@ -121,13 +281,37 @@ let timing = false;
 let cloneCount = false;
 let renderOnly = false;
 let tracing = false;
+let fastRefs = false;
+let fastRefsClass = false;
+let parallelDeflate = false;
+let fastDecodeName = false;
+let fastNumberToString = false;
+let fastSizeInBytes = false;
+let fastInflate = false;
+let fastParseNumber = false;
+let fastDictIter = false;
+let fastParseDict = false;
+let fastParseObject = false;
+let fastParseName = false;
+let fastSyncLoad = false;
+let fastDictArray = false;
+let fastIndirectObjects = false;
+let fastPdfnumberPool = false;
+let fastDictOnebuf = false;
+let fastArrayOnebuf = false;
+let instrumentParsedict = false;
+let dumpRawPdf = null;
+let measurePass = false;
+let instrumentSlotTypes = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
   else if (a === '--keep-open') keepOpen = true;
   else if (a === '--cpu-profile') cpuProfile = true;
+  else if (a === '--cpu-profile-process') cpuProfileProcess = true;
   else if (a === '--cpu-sampling') cpuSampling = parseInt(args[++i], 10);
   else if (a === '--heap-profile') heapProfile = true;
+  else if (a === '--heap-profile-process') heapProfileProcess = true;
   else if (a === '--heap-sampling') heapSampling = parseInt(args[++i], 10);
   else if (a === '--detach-pages') detachPages = true;       // accepted for backwards compat; default since the fix landed
   else if (a === '--no-detach-pages') detachPages = false;
@@ -141,6 +325,28 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--render-only') renderOnly = true;
   else if (a === '--tracing') tracing = true;
   else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ }
+  else if (a === '--fast-refs') fastRefs = true;
+  else if (a === '--fast-refs-class') fastRefsClass = true;
+  else if (a === '--parallel-deflate') parallelDeflate = true;
+  else if (a === '--fast-decode-name') fastDecodeName = true;
+  else if (a === '--fast-number-to-string') fastNumberToString = true;
+  else if (a === '--fast-size-in-bytes') fastSizeInBytes = true;
+  else if (a === '--fast-inflate') fastInflate = true;
+  else if (a === '--fast-parse-number') fastParseNumber = true;
+  else if (a === '--fast-dict-iter') fastDictIter = true;
+  else if (a === '--fast-parse-dict') fastParseDict = true;
+  else if (a === '--fast-parse-object') fastParseObject = true;
+  else if (a === '--fast-parse-name') fastParseName = true;
+  else if (a === '--fast-sync-load') fastSyncLoad = true;
+  else if (a === '--fast-dict-array') fastDictArray = true;
+  else if (a === '--fast-indirect-objects') fastIndirectObjects = true;
+  else if (a === '--fast-pdfnumber-pool') fastPdfnumberPool = true;
+  else if (a === '--fast-dict-onebuf') fastDictOnebuf = true;
+  else if (a === '--fast-array-onebuf') fastArrayOnebuf = true;
+  else if (a === '--instrument-parsedict') instrumentParsedict = true;
+  else if (a === '--dump-raw-pdf') dumpRawPdf = args[++i];
+  else if (a === '--measure-pass') measurePass = true;
+  else if (a === '--instrument-slot-types') instrumentSlotTypes = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -175,6 +381,156 @@ for (const p of required) {
   }
 }
 
+if (cpuProfileProcess && renderOnly) {
+  console.error('--cpu-profile-process is incompatible with --render-only (the process phase is skipped).');
+  process.exit(2);
+}
+if (heapProfileProcess && renderOnly) {
+  console.error('--heap-profile-process is incompatible with --render-only (the process phase is skipped).');
+  process.exit(2);
+}
+if (fastDictArray && (fastParseDict || fastDictIter)) {
+  console.error('--fast-dict-array subsumes --fast-parse-dict and --fast-dict-iter (Map-backed shims). Pick one shape.');
+  process.exit(2);
+}
+if (fastDictOnebuf && (fastDictArray || fastParseDict || fastDictIter)) {
+  console.error('--fast-dict-onebuf subsumes the other dict-shape shims (different storage shape). Pick one.');
+  process.exit(2);
+}
+if (measurePass && !fastDictOnebuf) {
+  console.error('--measure-pass requires --fast-dict-onebuf (the only shim that consumes setExpectedDictSlots so far).');
+  process.exit(2);
+}
+if (measurePass && incremental) {
+  console.error('--measure-pass operates on the pdf-lib load path; --incremental skips that path entirely.');
+  process.exit(2);
+}
+if (measurePass && renderOnly) {
+  console.error('--measure-pass needs the process phase; --render-only skips it.');
+  process.exit(2);
+}
+if (instrumentSlotTypes && !fastDictOnebuf) {
+  console.error('--instrument-slot-types reads fast-dict-onebuf\'s main buffer; pass --fast-dict-onebuf too.');
+  process.exit(2);
+}
+if (instrumentSlotTypes && (incremental || renderOnly)) {
+  console.error('--instrument-slot-types needs the process phase; not compatible with --incremental or --render-only.');
+  process.exit(2);
+}
+
+if (fastRefs && fastRefsClass) {
+  console.error('--fast-refs and --fast-refs-class are mutually exclusive (both shim PDFRef.of).');
+  process.exit(2);
+}
+// Install the dense-array cache for PDFRef.of's gen=0 path before any
+// pdf-lib operation. Side-effecting import; idempotent.
+if (fastRefs) {
+  await import('../docs/lib/fast-refs.mjs');
+  console.log('[harness] fast-refs: PDFRef.of dense-array cache for gen=0');
+}
+if (fastRefsClass) {
+  await import('../docs/lib/fast-refs-class.mjs');
+  console.log('[harness] fast-refs-class: PDFRef.of dense-array cache + class-constructor shape');
+}
+if (fastDecodeName) {
+  await import('../docs/lib/fast-decode-name.mjs');
+  console.log('[harness] fast-decode-name: skip decodeName regex when name has no #');
+}
+if (fastNumberToString) {
+  await import('../docs/lib/fast-number-to-string.mjs');
+  console.log('[harness] fast-number-to-string: skip redundant toString/split when no exponential');
+}
+if (fastSizeInBytes) {
+  await import('../docs/lib/fast-size-in-bytes.mjs');
+  console.log('[harness] fast-size-in-bytes: non-allocating ladder for xref byte-width');
+}
+if (fastInflate) {
+  await import('../docs/lib/fast-inflate.mjs');
+  console.log('[harness] fast-inflate: swap pako.inflate for node:zlib.inflateSync');
+}
+if (fastParseNumber) {
+  await import('../docs/lib/fast-parse-number.mjs');
+  console.log('[harness] fast-parse-number: direct-integer accumulator for parseRawNumber/parseRawInt');
+}
+if (fastDictIter) {
+  await import('../docs/lib/fast-dict-iter.mjs');
+  console.log('[harness] fast-dict-iter: in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto');
+}
+if (fastParseDict) {
+  await import('../docs/lib/fast-parse-dict.mjs');
+  console.log('[harness] fast-parse-dict: hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict');
+}
+if (fastParseObject) {
+  await import('../docs/lib/fast-parse-object.mjs');
+  console.log('[harness] fast-parse-object: first-byte dispatch in parseObject, gate true/false/null matchKeyword behind byte check');
+}
+if (fastParseName) {
+  await import('../docs/lib/fast-parse-name.mjs');
+  console.log('[harness] fast-parse-name: byte-slice + String.fromCharCode build for PDFObjectParser.parseName');
+}
+if (fastSyncLoad) {
+  await import('../docs/lib/fast-sync-load.mjs');
+  console.log('[harness] fast-sync-load: synchronify PDFParser load path, strip waitForTick machinery');
+}
+if (fastDictArray) {
+  await import('../docs/lib/fast-dict-array.mjs');
+  console.log('[harness] fast-dict-array: PDFDict backed by flat alternating array (subsumes fast-parse-dict + fast-dict-iter)');
+}
+if (fastIndirectObjects) {
+  await import('../docs/lib/fast-indirect-objects.mjs');
+  console.log('[harness] fast-indirect-objects: PDFContext.indirectObjects dense-array cache for gen=0 PDFRefs');
+}
+if (fastPdfnumberPool) {
+  await import('../docs/lib/fast-pdfnumber-pool.mjs');
+  console.log('[harness] fast-pdfnumber-pool: value-keyed cache in front of PDFNumber.of');
+}
+if (fastDictOnebuf) {
+  await import('../docs/lib/fast-dict-onebuf.mjs');
+  console.log('[harness] fast-dict-onebuf: ONE long-lived buffer for all PDFDict entries + small per-parser temp');
+}
+if (fastArrayOnebuf) {
+  await import('../docs/lib/fast-array-onebuf.mjs');
+  console.log('[harness] fast-array-onebuf: ONE long-lived buffer for all PDFArray elements + small per-parser temp');
+}
+if (instrumentParsedict) {
+  await import('./instrument-parsedict.mjs');
+}
+
+// --measure-pass loads the measure walker and the setter; both are
+// invoked in-flight (after rawPdf is in hand, before PDFDocument.load).
+let _runMeasurePass = null;
+if (measurePass) {
+  const { measure } = await import('../docs/lib/measure-pass.mjs');
+  const { setExpectedDictSlots } = await import('../docs/lib/fast-dict-onebuf.mjs');
+  let setExpectedArraySlots = null;
+  if (fastArrayOnebuf) {
+    const ma = await import('../docs/lib/fast-array-onebuf.mjs');
+    setExpectedArraySlots = ma.setExpectedArraySlots;
+  }
+  _runMeasurePass = (bytes) => {
+    const counts = measure(bytes);
+    setExpectedDictSlots(counts.dictSlots);
+    if (setExpectedArraySlots) setExpectedArraySlots(counts.arraySlots);
+    return counts;
+  };
+  console.log(
+    setExpectedArraySlots
+      ? '[harness] measure-pass: no-allocate prelude, pre-sizes dict + array main buffers to measured slot counts'
+      : '[harness] measure-pass: no-allocate prelude, pre-sizes fast-dict-onebuf mainBuf to measured dict-slot count',
+  );
+}
+
+// --instrument-slot-types loads the slot-type classifier; called after
+// setOutline, before save.
+let _classifySlots = null;
+let _printSlotHistogram = null;
+if (instrumentSlotTypes) {
+  const m = await import('./instrument-slot-types.mjs');
+  _classifySlots = m.classifySlots;
+  _printSlotHistogram = m.printHistogram;
+  console.log('[harness] instrument-slot-types: classify main[] slots by PDFObject subtype after setOutline');
+}
+
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
   ? resolve(process.cwd(), outArg)
@@ -370,6 +726,8 @@ try {
   let rawPdfBytes = null;
   let processMs = null;
   let processBreakdown = null;
+  let processProfilePath = null;
+  let processHeapProfilePath = null;
   let finalPdf = null;
 
   if (!renderOnly) {
@@ -413,6 +771,13 @@ try {
   pdfMs = Date.now() - tPdfStart;
   rawPdfBytes = rawPdf.length;
 
+  if (dumpRawPdf) {
+    const dumpPath = resolve(process.cwd(), dumpRawPdf);
+    mkdirSync(dirname(dumpPath), { recursive: true });
+    writeFileSync(dumpPath, Buffer.from(rawPdf));
+    console.log(`[harness] dumped raw Chrome PDF: ${dumpPath} (${(rawPdf.length / 1024 / 1024).toFixed(1)} MB)`);
+  }
+
   const tGenEnd = Date.now();
   generateMs = tGenEnd - tGenStart;
   console.log(`[harness] generate ${fmtMs(generateMs)}  (parseOutline=${fmtMs(parseOutlineMs)}, page.pdf=${fmtMs(pdfMs)}, ${(rawPdf.length / 1024 / 1024).toFixed(1)}MB)`);
@@ -430,6 +795,29 @@ try {
   //
   // Either way we time the full phase plus the meaningful sub-steps so the
   // breakdown matches across runs.
+  //
+  // --cpu-profile-process / --heap-profile-process attach Node's inspector
+  // (Profiler / HeapProfiler domains) around this block, sharing one session.
+  // The render phase profiles via CDP because the work happens in Chromium;
+  // the process phase profiles via Node's inspector because pdf-lib runs locally.
+  // The CPU output file shape (V8 .cpuprofile JSON) is the same either way.
+  let inspectorSession = null;
+  if (cpuProfileProcess || heapProfileProcess) {
+    inspectorSession = new Session();
+    inspectorSession.connect();
+  }
+  if (cpuProfileProcess) {
+    await inspectorSession.post('Profiler.enable');
+    await inspectorSession.post('Profiler.setSamplingInterval', { interval: cpuSampling });
+    await inspectorSession.post('Profiler.start');
+    console.log(`[harness] process cpu profile: sampling every ${cpuSampling}us`);
+  }
+  if (heapProfileProcess) {
+    await inspectorSession.post('HeapProfiler.enable');
+    await inspectorSession.post('HeapProfiler.startSampling', { samplingInterval: heapSampling });
+    console.log(`[harness] process heap profile: sampling every ${heapSampling}B`);
+  }
+
   const tProcStart = Date.now();
   if (incremental) {
     const tIncStart = Date.now();
@@ -438,15 +826,26 @@ try {
     finalPdf = bytes;
     processBreakdown = { incrementalMs: incMs, ...stats };
   } else {
-    // pdf-lib's defaults are catastrophically slow: parseSpeed=Slow (100
-    // objects/tick) and objectsPerTick=50 both yield to the event loop
-    // between batches, turning a ~2s load into ~36s on a 52 MB PDF (~34s
-    // pure idle in the cpuprofile). Override to Fastest/Infinity so the
-    // "baseline" we report reflects the library's actual CPU cost, not
-    // an artefact of yielding cadence. The harness has no parallel work
-    // to make space for, so cooperative yielding is pure overhead here.
+    // Upstream pdf-lib's load yields to the event loop every
+    // `parseSpeed` objects via `await waitForTick()`; the save side
+    // does the same every `objectsPerTick`. With --fast-sync-load on
+    // (the production default) both yield gates are ripped out -- the
+    // option arguments are silently ignored, so we don't bother
+    // passing them. Without --fast-sync-load, the run measures pdf-lib's
+    // cautious defaults (parseSpeed: Slow, objectsPerTick: 50) which
+    // yield ~500 / ~1000 times per phase on the book; that's pdf-lib's
+    // out-of-the-box behaviour, useful as a baseline for A/B work.
+    let measurePassMs = 0;
+    let measureCounts = null;
+    if (_runMeasurePass) {
+      const tMeasureStart = Date.now();
+      measureCounts = _runMeasurePass(rawPdf);
+      measurePassMs = Date.now() - tMeasureStart;
+      console.log(`[harness] measure-pass ${fmtMs(measurePassMs)}  (dicts=${measureCounts.dicts}, dictSlots=${measureCounts.dictSlots}, maxRecursion=${measureCounts.maxRecursion})`);
+    }
+
     const tLoadStart = Date.now();
-    const pdfDoc = await PDFDocument.load(rawPdf, { parseSpeed: ParseSpeeds.Fastest });
+    const pdfDoc = await PDFDocument.load(rawPdf);
     const loadMs = Date.now() - tLoadStart;
 
     setMetadata(pdfDoc, meta);
@@ -455,18 +854,59 @@ try {
     setOutline(pdfDoc, outline, false);
     const setOutlineMs = Date.now() - tSetOutlineStart;
 
+    if (_classifySlots) {
+      const tClassifyStart = Date.now();
+      const slotCounts = _classifySlots();
+      const classifyMs = Date.now() - tClassifyStart;
+      console.log(`[harness] instrument-slot-types: classify took ${classifyMs}ms`);
+      console.log('');
+      _printSlotHistogram(slotCounts, 'main after load+setOutline');
+      console.log('');
+    }
+
     const tSaveStart = Date.now();
-    finalPdf = await pdfDoc.save({ objectsPerTick: Infinity });
+    let parallelStreamCount = 0;
+    if (parallelDeflate) {
+      const { bytes, streamCount } = await parallelSave(pdfDoc, { objectsPerStream: 500 });
+      finalPdf = bytes;
+      parallelStreamCount = streamCount;
+    } else {
+      finalPdf = await pdfDoc.save();
+    }
     const saveMs = Date.now() - tSaveStart;
 
-    processBreakdown = { loadMs, setOutlineMs, saveMs };
+    processBreakdown = { measurePassMs, loadMs, setOutlineMs, saveMs, parallelStreamCount };
+    if (measureCounts) processBreakdown.measureCounts = measureCounts;
   }
   const tProcEnd  = Date.now();
   processMs = tProcEnd - tProcStart;
+  const heapStop = heapProfileProcess ? await inspectorSession.post('HeapProfiler.stopSampling') : null; // stop both profilers before serialising either, so the
+  const cpuStop = cpuProfileProcess ? await inspectorSession.post('Profiler.stop') : null;               // CPU profile never samples the heap profile's stringify/write
+  if (heapStop) {
+    await inspectorSession.post('HeapProfiler.disable');
+    processHeapProfilePath = join(outDir, 'process.heapprofile');
+    const profileJson = JSON.stringify(heapStop.profile);
+    writeFileSync(processHeapProfilePath, profileJson);
+    console.log(`[harness] process heap profile: ${processHeapProfilePath} (${(profileJson.length / 1024 / 1024).toFixed(1)} MB)`);
+  }
+  if (cpuStop) {
+    await inspectorSession.post('Profiler.disable');
+    processProfilePath = join(outDir, 'process.cpuprofile');
+    const profileJson = JSON.stringify(cpuStop.profile);
+    writeFileSync(processProfilePath, profileJson);
+    console.log(`[harness] process cpu profile: ${processProfilePath} (${(profileJson.length / 1024 / 1024).toFixed(1)} MB)`);
+  }
+  if (inspectorSession) inspectorSession.disconnect();
   if (incremental) {
     console.log(`[harness] process  ${fmtMs(processMs)}  (incremental=${fmtMs(processBreakdown.incrementalMs)}, +${processBreakdown.appendedBytes}B, ${processBreakdown.newObjectCount} new objs)`);
   } else {
-    console.log(`[harness] process  ${fmtMs(processMs)}  (load=${fmtMs(processBreakdown.loadMs)}, setOutline=${fmtMs(processBreakdown.setOutlineMs)}, save=${fmtMs(processBreakdown.saveMs)})`);
+    const parTag = processBreakdown.parallelStreamCount
+      ? ` (parallel-deflate: ${processBreakdown.parallelStreamCount} streams)`
+      : '';
+    const measureTag = processBreakdown.measurePassMs
+      ? `measure=${fmtMs(processBreakdown.measurePassMs)}, `
+      : '';
+    console.log(`[harness] process  ${fmtMs(processMs)}  (${measureTag}load=${fmtMs(processBreakdown.loadMs)}, setOutline=${fmtMs(processBreakdown.setOutlineMs)}, save=${fmtMs(processBreakdown.saveMs)}${parTag})`);
   }
   }  // end if (!renderOnly)
 
@@ -506,6 +946,8 @@ try {
     record.phases.process = {
       ms: processMs,
       mode: incremental ? 'incremental' : 'pdf-lib-roundtrip',
+      cpuProfile: processProfilePath,
+      heapProfile: processHeapProfilePath,
       ...processBreakdown,
     };
   }
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
new file mode 100644
index 0000000..eae3a11
--- /dev/null
+++ b/perf/notes/08-pdf-lib.md
@@ -0,0 +1,5118 @@
+# pdf-lib: profiling the process phase
+
+Wiring `--cpu-profile-process` so the pdf-lib roundtrip becomes visible to the same `analyze-profile.mjs` toolchain we already use on the render phase, then following the bottom-up table -- pako dominates with per-stream init overhead, routing `pako.deflate` through `node:zlib` saves ~1.5 s of process wall (save -58 %).
+
+The render-side investigations (notes [01](01-baseline-and-detach.md)
+through [07](07-memory.md)) brought render down from ~104 s to ~8 s
+and process from ~40 s to ~5 s. By [`pdf-lib parseSpeed: Fastest`](01-baseline-and-detach.md)
+the process phase was a flat ~5 s of `load + setOutline + save`, the
+sub-step numbers were the only thing we knew about it, and there was
+no bottom-up table to point at: CDP's `Profiler` attaches to Chromium
+and the process phase runs in Node, so `--cpu-profile` couldn't see
+it.
+
+## `--cpu-profile-process` (and `--heap-profile-process`)
+
+Added to `measure.mjs`: opens an in-process V8 Profiler via
+`node:inspector/promises`, brackets the process phase the same way
+`--cpu-profile` brackets render, and writes `process.cpuprofile`
+alongside `render.cpuprofile`. Same `.cpuprofile` JSON shape, so
+the existing `analyze-profile.mjs` / `find-callers.mjs` /
+`find-callees.mjs` work unchanged. See the *Profiling pdf-lib
+(process phase): canonical command* section in [the README](../README.md)
+for the operational form.
+
+The heap counterpart -- `--heap-profile-process` -- arrived later
+(once allocation became the obvious next thing to attack: GC was
+sitting at the top of every CPU profile in this phase). It shares
+the same inspector session, so capturing both in one run is one
+flag away. Output is a `.heapprofile`, a tree of
+`{ callFrame, selfSize, children }` rooted at `head` -- *not* the
+flat `.cpuprofile` shape -- so `analyze-heap-profile.mjs` handles
+it instead of the cpu analyzers. See *Profiling pdf-lib heap
+allocation (process phase): canonical command* in
+[the README](../README.md) for the operational form. The findings
+this tool enabled are folded into the per-shim sections below
+(decodeName / sizeInBytes / PDFDict.entries / ...) -- each names
+which path it came from when the heap profile, not the cpu
+profile, was the diagnostic that pointed at the function.
+
+First run on the 1638-page book (`--detach-pages --no-timing
+--cpu-profile-process --cpu-sampling 100`), process 4.66 s (load
+1.88 s, setOutline 0.01 s, save 2.77 s). Top of the bottom-up table:
+
+```
+samples: 8560   duration: 4.68s   us/sample: 547
+
+   self_ms   self_%   function  @  source
+   -------   ------   ----------------------------------------------
+    645.24   13.85%   (garbage collector)
+    460.42    9.88%   longest_match            pako/lib/zlib/deflate.js:231
+    428.15    9.19%   deflateInit2             pako/lib/zlib/deflate.js:1327
+    374.02    8.03%   PDFRef.of                pdf-lib/.../PDFRef.js:34
+    218.73    4.69%   decodeName               pdf-lib/.../PDFName.js:9
+    218.73    4.69%   PDFDict.entries          pdf-lib/.../PDFDict.js:22
+    182.64    3.92%   deflate_slow             pako/lib/zlib/deflate.js:726
+    119.75    2.57%   parseRawNumber           pdf-lib/.../BaseParser.js:33
+    114.28    2.45%   DeflateState             pako/lib/zlib/deflate.js:1092
+    113.19    2.43%   parseName                pdf-lib/.../PDFObjectParser.js:117
+     ... pako rows and parser rows continue down the table ...
+```
+
+Adding up pako frames (`longest_match` + `deflateInit2` +
+`deflate_slow` + `DeflateState` + `lm_init` + `compress_block` +
+`build_tree` + `Deflate.push` + `adler32`) lands at **~1.42 s, ~30 %
+of the process phase**. Of that, the *initialization* group
+(`deflateInit2` + `DeflateState` + `lm_init`) was **~628 ms** -- so
+~44 % of pako's time was spent setting up Deflate state, not
+compressing bytes. A setup share that large only makes sense if
+the call count is high.
+
+## Are we compressing Chrome's already-compressed streams?
+
+Reasonable hypothesis: pdf-lib loads, decompresses Chrome's content
+streams, and then re-compresses them on save. That would put Chrome's
+~52 MB of content through deflate twice, and explain the heavy
+pako time as wasted work.
+
+Walking the code:
+
+- `PDFObjectParser.parseDictOrStream` (`pdf-lib/.../parser/PDFObjectParser.js:171`)
+  always ends with `return PDFRawStream.of(dict, contents)`. Every
+  stream pdf-lib parses out of the input is a `PDFRawStream` holding
+  the verbatim bytes between `stream` / `endstream`. No decompression.
+- `PDFRawStream.getContents` (`pdf-lib/.../objects/PDFRawStream.js:22`)
+  returns those bytes unchanged.
+- `PDFStreamWriter.computeBufferSize` (`pdf-lib/.../writers/PDFStreamWriter.js:43-46`)
+  marks `shouldNotCompress = true` for anything that's `instanceof
+  PDFStream` (which includes `PDFRawStream`). Those go out verbatim
+  with the original `/Filter` preserved.
+
+`pako.deflate` lives in `PDFFlateStream.computeContents`
+(`pdf-lib/.../structures/PDFFlateStream.js:15`); the only subclasses
+are `PDFContentStream`, `PDFCrossRefStream`, and `PDFObjectStream`.
+None of those are instantiated by the parser. So **Chrome's content
+streams ride through as `PDFRawStream` and never see pako**.
+
+Confirmed by instrumenting `pako.deflate` and re-running the save
+on the produced book.pdf:
+
+```
+deflate calls during save : 4524
+bytes fed to deflate      : 24.28 MB
+bytes produced            :  4.39 MB
+final pdf size            : 16.08 MB
+```
+
+The 4,524 deflate calls are pdf-lib's **own** new streams:
+
+- ~4,523 `PDFObjectStream` chunks. `PDFStreamWriter.forContext`
+  defaults to `objectsPerStream = 50`; the book has **228,191
+  indirect objects**, of which the ~2,061 stream objects are written
+  uncompressed, so pdf-lib packs the remaining ~226 k into ~4,523
+  chunks of 50 each.
+- 1 `PDFCrossRefStream` for the xref.
+
+## Wait -- the pdf-lib output is *smaller* than Chrome's. What's going on?
+
+Chrome's raw PDF is 39.3 MB, pdf-lib's final PDF is 16.1 MB. That
+23 MB shrink isn't pdf-lib throwing anything away -- it's compressing
+something Chrome chose to emit verbatim.
+
+Tallying the 228,191 indirect objects pdf-lib sees by type:
+
+```
+130,787  StructElem /S=/NonStruct      (a11y wrapper around content w/o structural role)
+ 22,193  StructElem /S=/Strong         (bold)
+ 11,003  Dict /Type=/Annot             (mostly hyperlinks)
+ 10,054  StructElem /S=/Link
+  9,164  StructElem /S=/P              (paragraph)
+  8,417  StructElem /S=/Em             (emphasis)
+  5,270  StructElem /S=/TD             (table cell)
+  4,822  StructElem /S=/Code
+  3,392  StructElem /S=/LI             (list item)
+  3,040  StructElem /S=/H5
+    ... another ~15 k StructElems in long tail (H1-H6, L, TR, Art, ...)
+  2,061  PDFRawStream                  (Chrome's content + font + image streams)
+  1,651  Dict /Type=/Page
+   ... ~3.5 k misc dicts ...
+```
+
+**Over 225,000 are tiny `<<...>>` dicts, ~210 k of them StructElems**
+-- the tagged-PDF structure tree, which Chrome emits because we pass
+`tagged: true` to `page.pdf()`. Each `StructElem` is something like
+`<</Type /StructElem /S /P /P [123 0 R] /K [...] /Pg 5 0 R>>` -- a
+few hundred bytes of mostly boilerplate.
+
+Chrome writes them as plain text indirect objects -- 225k × a few
+hundred bytes ≈ 28 MB of `<<...>>` source. pdf-lib's
+`PDFStreamWriter` packs those 50 at a time into PDFObjectStreams,
+each of which is then deflate-compressed. The dict syntax is wildly
+repetitive across siblings (`/Type /StructElem` literally appears
+over 200k times), so deflate compresses the packed text ~5.5x. The
+24.28 MB of small-dict text fed to deflate above comes out the
+other side at 4.39 MB. Add the ~11 MB of `PDFRawStream` bytes that
+pass through verbatim, plus a few KB of misc, and the 16.1 MB total
+checks out.
+
+The pdf-lib roundtrip's win over Chrome's raw output is **encoding
+the same information** in PDF 1.5's compressed-object-streams
+feature instead of as plain `<<...>>` text. Skia's PDF writer
+chooses not to use that feature.
+
+This also explains the pako profile shape. The workload is *many
+small streams* (~4,500 of them at ~5.4 KB input each), which is
+exactly where per-stream initialization dominates: the 628 ms in
+`deflateInit2` + `DeflateState` + `lm_init` is the sum of a ~140 µs
+setup cost paid ~4,500 times,
+while the per-call payload is small enough that the actual
+compression work (~755 ms across `longest_match` + `deflate_slow`
++ `compress_block` + `build_tree` + `adler32`) isn't proportionally
+larger.
+
+## The shim
+
+PDF `/FlateDecode` (ISO 32000-1 §7.4.4) is the zlib format
+(RFC 1950): 2-byte zlib header + raw deflate body (RFC 1951) + 4-byte
+Adler-32 trailer. Both `pako.deflate(data)` and Node's
+`zlib.deflateSync(data)` produce that format at default level 6.
+Verified head-to-head: each compresses to an equivalent-size zlib
+stream starting `78 9c`, and either can decompress the other's
+output back to the original input bytes.
+
+`docs/lib/fast-deflate.mjs` is a side-effecting import that mutates
+the live `pako` exports:
+
+```js
+import { deflateSync } from "node:zlib";
+import pako from "pako";
+
+if (!pako.__fastDeflateInstalled) {
+  const original = pako.deflate;
+  pako.deflate = function fastDeflate(data, options) {
+    if (options) return original.call(pako, data, options);
+    return deflateSync(data);
+  };
+  pako.__fastDeflateInstalled = true;
+}
+```
+
+pdf-lib's CJS code reads `require("pako").deflate` at call time
+(`pako_1.default.deflate(unencodedContents)` inside
+`PDFFlateStream.computeContents`), so mutating the live module
+exports propagates without forking pdf-lib. The `options`
+fallthrough means any caller that needs pako's non-default
+behaviour (dictionaries, raw deflate, custom level) is unaffected;
+pdf-lib's only call site passes no options.
+
+Microbenchmark on the harness machine; both rows are single large
+`deflateSync` calls, not the book's many-small-streams save workload:
+
+```
+zlib.deflateSync(50 MB of ASCII)                        112 ms
+zlib.deflateSync(book.pdf as input, 16.1 MB)            283 ms
+```
+
+For comparison, pako spent ~1.42 s on the book's actual save
+workload (~24 MB across 4,524 calls). Same order of magnitude as
+the raw-throughput numbers above, but with more per-call overhead
+-- which matches what a JS implementation is expected to lose
+against C when amortised across many small calls.
+
+`docs/render-book.mjs` imports the shim unconditionally near its
+pdf-lib import; production runs through it. `measure.mjs` adds a
+`--fast-deflate` flag, opt-in in the harness so paired pre/post
+A/Bs are still easy.
+
+## Results
+
+Paired A/B, four interleaved runs (`pre1 post1 pre2 post2`) with
+`--detach-pages --no-timing --cpu-profile-process --cpu-sampling
+100`, same 1638-page book each:
+
+| metric        | pre1   | pre2   | pre avg | post1  | post2  | post avg | Δ                |
+| ------------- | ------ | ------ | ------- | ------ | ------ | -------- | ---------------- |
+| **process**   | 4.20 s | 4.27 s | **4.24 s** | 2.79 s | 2.74 s | **2.77 s** | **-1.47 s (-35 %)** |
+| ↳ load        | 1.53 s | 1.54 s | 1.54 s  | 1.67 s | 1.61 s | 1.64 s   | +0.10 s (noise; load goes through `pako.inflate`, untouched) |
+| ↳ setOutline  | 0.01 s | 0.01 s | 0.01 s  | 0.01 s | 0.01 s | 0.01 s   | unchanged |
+| ↳ **save**    | 2.66 s | 2.72 s | **2.69 s** | 1.11 s | 1.12 s | **1.12 s** | **-1.57 s (-58 %)** |
+| pdf size      | 16.1 MB | 16.1 MB | 16.1 MB | 16.1 MB | 16.1 MB | 16.1 MB | identical |
+
+Render and generate wall-clock numbers varied ±5 s between runs
+(machine load) but the process numbers are tight to ±0.05 s.
+
+Post-fix bottom-up profile, same flags:
+
+```
+samples: 5229   duration: 2.82s   us/sample: 540
+
+   self_ms   self_%   function
+   -------   ------   --------------------------------------------------
+    348.83   12.48%   writeSync                  (Node libuv syscall)
+    335.87   12.01%   PDFRef.of                  pdf-lib/.../PDFRef.js:34
+    262.44    9.39%   (garbage collector)
+    165.24    5.91%   PDFDict.entries
+    159.84    5.72%   decodeName
+    108.00    3.86%   parseName
+    102.60    3.67%   parseRawNumber
+     88.56    3.17%   parseRawInt
+     72.90    2.61%   PDFName.of
+     71.28    2.55%   parseDict
+     ... pako rows absent from the table ...
+```
+
+Two structural changes worth calling out:
+
+- All pako frames dropped out of the top 20. `writeSync` at 12.48 %
+  is `node:zlib`'s native binding running zlib's C++ work
+  synchronously on the main thread; that work has no JS frames of
+  its own, so the sampler bills it to `writeSync`. The ~349 ms here
+  is the total across all ~4,500 calls.
+- `(garbage collector)` dropped from 645 ms to 262 ms (-383 ms).
+  That matches the per-call allocator pressure from creating a fresh
+  `Deflate` instance + `DeflateState` per pako call, now gone.
+
+End-to-end `book.bat` run with the shim:
+
+```
+render:   8.5s   (1651 pages)
+generate: 37.1s  (raw 39.3 MB)
+process:  2.5s
+saved:    docs\_pdf\book.pdf  (16.1 MB)
+total:    50.1s
+```
+
+Process is now under three seconds on the production path. Wall-clock
+total ~50 s vs the prior ~70 s baseline. Output PDF byte size
+unchanged from the pre-shim build (16.1 MB; standard `/CreationDate`
+drift between runs).
+
+## After the shim: what's left
+
+After the shim the bottom-up profile points at the next two
+JS-attributable buckets:
+
+- `PDFRef.of` at 336 ms self-time (12 %). The function builds a
+  string key `<num> <gen> R` per call and Map-looks it up; the
+  string allocation per call is the cost. A drop-in fix would
+  replace the `Map<string>` pool with a flat array for the gen=0
+  case and a fallback Map for gen ≠ 0. Followed up below.
+- `(garbage collector)` at 262 ms (9 %). Tied to `PDFRef.of` and
+  the per-object dict allocations in the writer; expected to
+  shrink along with the first item.
+
+## `PDFRef.of`: dense-array cache for the gen=0 path
+
+The upstream implementation:
+
+```js
+var pool = new Map();
+PDFRef.of = function (objectNumber, generationNumber) {
+    if (generationNumber === void 0) { generationNumber = 0; }
+    var tag = objectNumber + " " + generationNumber + " R";   // alloc
+    var instance = pool.get(tag);                              // hash
+    if (!instance) {
+        instance = new PDFRef(ENFORCER, objectNumber, generationNumber);
+        pool.set(tag, instance);
+    }
+    return instance;
+};
+```
+
+Per call: build a fresh `<obj> <gen> R` string, hand it to a
+`Map<string>` lookup that has to hash it, branch on miss. The
+string allocation is the cost we care about -- the dedup pool
+itself works correctly, it's just paying for its key on every read.
+
+### Workload shape
+
+Instrumented `PDFRef.of` and re-ran the harness through load + save:
+
+```
+total PDFRef.of calls     : 1,231,643
+  gen=0 (or undefined)    : 1,010,034  (82 %)
+  gen != 0                :   221,608  (18 %)
+gen=N value distribution (top, 4523 calls each):
+  gen=1, gen=2, ... gen=50: 4523 calls/value
+```
+
+The 1.0 M gen=0 calls are what the parser does for every
+encountered `N 0 R` reference and every per-object PDFRef
+construction. The 221 k gen != 0 calls are pdf-lib's xref-stream
+bookkeeping for PDF 1.5+ compressed-object entries: in a
+cross-reference stream's type-2 entry, the spec uses the
+"generation number" field to store the **index of the object
+within its ObjStm**, and pdf-lib feeds that index straight to
+`PDFRef.of` (`PDFXRefStreamParser.js:74-80`). 4,523 ObjStms × 50
+entries each ≈ the observed 221 k.
+
+So 82 % of calls have generationNumber=0. That's the path worth
+optimising.
+
+### The shim
+
+`docs/lib/fast-refs.mjs` is the symmetric side-effecting import to
+`fast-deflate`:
+
+```js
+import { PDFRef } from "pdf-lib";
+
+if (!PDFRef.__fastPoolInstalled) {
+  const original = PDFRef.of;
+  const pool0 = [];
+  PDFRef.of = function fastOf(objectNumber, generationNumber) {
+    if (generationNumber === undefined || generationNumber === 0) {
+      const existing = pool0[objectNumber];
+      if (existing) return existing;
+      const fresh = original.call(PDFRef, objectNumber, 0);
+      pool0[objectNumber] = fresh;
+      return fresh;
+    }
+    return original.call(PDFRef, objectNumber, generationNumber);
+  };
+  PDFRef.__fastPoolInstalled = true;
+}
+```
+
+Dense-array indexed by `objectNumber` for the gen=0 case -- no
+string alloc, no Map hash, just an array read. gen != 0 passes
+through to the original (which still allocates the tag and runs
+the Map lookup, but that's only 18 % of calls).
+
+The cache is **in front of** the original `PDFRef.of`, not a
+replacement: on a miss we call the original to produce the PDFRef
+instance, then cache it. That dodges the module-private `ENFORCER`
+token the upstream constructor demands. Memory cost is a second
+reference per PDFRef on top of the upstream pool's entry -- ~228 k
+tiny objects, negligible.
+
+The interning contract is preserved: `PDFRef.of(42) === PDFRef.of(42, 0)`
+and both `!== PDFRef.of(42, 1)`, as before.
+
+### Results: profiler-on vs profiler-off matters
+
+First A/B with the process-phase profiler attached (paired,
+`--detach-pages --no-timing --cpu-profile-process --cpu-sampling 100
+--fast-deflate [--fast-refs]`):
+
+| metric    | pre (no fast-refs) | post (+ fast-refs) | Δ |
+| ---       | ---                | ---                | --- |
+| process   | 2.94 s             | 2.52 s             | **-0.42 s (-14 %)** |
+| ↳ load    | 1.81 s             | 1.42 s             | -0.39 s |
+| ↳ save    | 1.12 s             | 1.08 s             | flat |
+| `PDFRef.of` self in profile | 336 ms (12 %) | 148 ms (5.9 %) | -188 ms |
+| `(garbage collector)` self  | 262 ms (9 %) | 194 ms (7.8 %) | -68 ms |
+
+`PDFRef.of`'s self-time roughly halved, GC pressure dropped, and
+the wall-clock saving (390 ms on load) looked like a clean win.
+
+But: paired A/B *without* the profiler attached told a different
+story:
+
+| metric    | pre (no fast-refs) | post (+ fast-refs) | Δ |
+| ---       | ---                | ---                | --- |
+| process   | 2.48 s             | 2.26 s             | **-0.22 s (-9 %)** |
+| ↳ load    | 1.51 s             | 1.27 s             | **-0.24 s (-16 %)** |
+| ↳ save    | 0.96 s             | 0.98 s             | flat |
+
+**Real wall-clock saving is ~240 ms**, not 390 ms. The remaining
+~150 ms of the profiler-on delta was profiler-attribution overhead
+that our shim removed by making the hot function shorter -- fewer
+samples landing on `PDFRef.of`, less per-sample tax. The profiler
+isn't lying about which function is expensive; it's overstating
+*how much* that expense will move wall-clock once you fix it.
+
+The diagnostic question to tell these apart: *what's the call
+rate?* At 1.2 M calls per load, even a few microseconds of
+sampling overhead per call adds up to hundreds of milliseconds in
+the profile. Functions called millions of times need a no-profile
+A/B as a sanity check before claiming the wall-clock saving the
+profile implied. Functions called a few times per page (or once
+per render) don't.
+
+Both numbers are real -- the bottom-up profile is the right
+*target* for "what's worth fixing," but a no-profile A/B is the
+right *measurement* for "how big the win was."
+
+### Production confirmation
+
+`book.bat` with both shims, two consecutive runs:
+
+```
+render:   9.1s   (1651 pages)
+generate: 37.5s
+process:  2.3s
+saved:    docs\_pdf\book.pdf  (16.1 MB)
+total:    50.7s
+```
+
+Process dropped from the prior 2.5 s (with just `fast-deflate`) to
+2.3 s. `book.bat` rounds to 0.1 s and is single-run so individual
+phase numbers carry some run-to-run jitter, but the harness's
+2.48 → 2.26 paired-A/B confirms the ~200 ms move is real.
+
+### What this didn't fix
+
+The post-`fast-refs` bottom-up table:
+
+```
+samples: 4668   duration: 2.53s   us/sample: 542
+
+   self_ms   self_%   function                   source
+   -------   ------   --------------------------------------------------
+    341.17   13.59%   writeSync                  (Node libuv -- zlib's C++ work)
+    194.41    7.75%   (garbage collector)
+    181.96    7.25%   PDFDict.entries            pdf-lib/.../PDFDict.js:22
+    172.21    6.86%   decodeName                 pdf-lib/.../PDFName.js:9
+    147.84    5.89%   PDFRef.of                  pdf-lib/.../PDFRef.js:34  (the 18 % gen != 0 residue)
+     96.40    3.84%   parseName
+     95.31    3.80%   parseRawNumber
+     78.52    3.13%   parseDict
+     ...
+```
+
+`PDFRef.of` is still on the list at 148 ms -- that's the 221 k
+gen != 0 calls still going through the upstream string-keyed Map.
+Optimising those would require either: (a) a 2D structure keyed by
+gen first then objectNumber, or (b) accepting that the in-ObjStm
+"index as generation" usage is short-lived bookkeeping (the parser
+creates these refs once to populate xref tables, then mostly
+re-resolves the actual `N 0 R` form). Neither moves the wall-clock
+total enough to justify -- 150 ms of a 50 s build is the noise floor.
+
+Above `PDFRef.of`, the load-phase costs (`decodeName`, `parseName`,
+`parseRawNumber`, `parseDict`, etc.) are pdf-lib's actual parser
+work. Those are O(input size) and pretty close to fundamental --
+shrinking them would mean rewriting the parser.
+
+### What's left on save
+
+After the fast-refs shim the process-phase profile's top
+self-time entry was still `writeSync` at ~340 ms / ~14 %. The name
+is misleading -- not `fs.writeFileSync` writing the output PDF,
+but `node:zlib`'s native binding inside `deflateSync`.
+`find-callers` attributes the chain:
+
+```
+writeSync                 344 ms   (zlib native)
+  processChunkSync        node:zlib:399
+  zlibBufferSync          node:zlib:165
+    PDFFlateStream.computeContents     186 ms   (pdf-lib stream compression)
+    fastDeflate (our shim)             130 ms
+    syncBufferWrapper                   34 ms
+```
+
+So the cost is pure CPU-bound deflate during `pdfDoc.save()`. The
+streams being compressed: pdf-lib's `PDFStreamWriter` (the default
+when `useObjectStreams: true`) groups every non-stream,
+non-encrypted, gen=0 indirect object into `PDFObjectStream` chunks
+of 50, deflates each, and writes the result. On the book that's
+~4,500 chunks, each a small deflate job, all running serially on
+the main thread.
+
+## Parallelising save's deflate on libuv's pool
+
+### Why not just async-deflate inline
+
+pdf-lib's serializer is synchronous at the relevant call sites:
+`PDFFlateStream.computeContents`
+(`pdf-lib/.../structures/PDFFlateStream.js:13`) is a closure that
+returns `pako.deflate(unencodedContents)` inline, called from
+`cache.access()` during `sizeInBytes()`. Swapping `deflateSync` →
+async `deflate` would mean rewriting the whole save path to await
+every stream. The call sites don't expect a promise.
+
+### Why not `useObjectStreams: false`
+
+The one-liner that skips the whole problem. Measured on the book:
+
+| variant | save | process | PDF size |
+| --- | --- | --- | --- |
+| pdf-lib default (objectsPerStream=50, sync) | 1.01 s | 2.30 s | 16.1 MB |
+| `useObjectStreams: false`                   | 0.59 s | 2.17 s | **40.5 MB** |
+
+A 2.5x file-size regression. The whole point of pdf-lib's
+roundtrip over Chrome's raw output was to pack the small dicts
+into compressed object streams. Not an option.
+
+### What actually worked: parallel pre-deflate + larger chunks
+
+`docs/lib/parallel-deflate.mjs` subclasses pdf-lib's
+`PDFStreamWriter` and splits its `computeBufferSize` into three
+phases:
+
+1. **Classify** indirect objects into uncompressed (streams,
+   encrypt, gen != 0) vs compressed chunks of N. Same logic as
+   upstream, no behaviour change.
+2. **Instantiate all `PDFObjectStream`s up-front**, snapshot their
+   unencoded contents, then `await Promise.all` an async
+   `zlib.deflate` per stream. Libuv's thread pool (default 4) runs
+   them concurrently. Write each result into the stream's
+   `contentsCache.value`.
+3. **Size + emit** -- same as upstream, but every `cache.access()`
+   is a hit, so save's loop never touches deflate.
+
+The xrefStream is one more `PDFFlateStream` but its contents
+depend on the offsets computed in phase 3, so we pre-deflate it
+via `node:zlib.deflateSync` right after those offsets are pinned
+-- one stream, sync is fine, and pre-populating its cache means
+`computeIndirectObjectSize` later is a hit too. The net effect:
+every deflate that happens during a save goes through `node:zlib`,
+and pdf-lib's pure-JS fallback never runs.
+
+Exposed as `parallelSave(pdfDoc, options)`. Drop-in for
+`pdfDoc.save` when `useObjectStreams: true` -- same pre-serialize
+hooks (addDefaultPage, updateFieldAppearances, flush),
+byte-near-equivalent output (zlib's LZ77 match choices may differ
+from pdf-lib's default deflate library at the byte level, but the
+wire format is identical).
+
+### First try with default `objectsPerStream=50` was slower
+
+Profile diff (paired `--cpu-profile-process --cpu-sampling 100`):
+
+| metric | serial (default) | parallel @ 50 (4,523 streams) | Δ |
+| --- | --- | --- | --- |
+| `writeSync` self  | 345 ms | 79 ms | **-266 ms** |
+| `write` (native, libuv setup) | <1 ms | 118 ms | **+117 ms** |
+| `close` (native, libuv teardown) | <1 ms | 96 ms | **+95 ms** |
+| net main-thread zlib + libuv overhead | 346 ms | 293 ms | -53 ms |
+
+The actual deflate work did move off-thread (`writeSync` dropped
+sharply), but libuv's per-`uv_work_t` dispatch overhead on 4,523
+tiny jobs ate most of the savings. ~50 µs/job × ~4,500 jobs ≈
+225 ms of pure dispatch.
+
+### Fix: bigger chunks via `objectsPerStream: 500`
+
+Ten-fold-larger object streams cut the chunk count from ~4,500 to
+~450. Same total deflate work, but in ~450 jobs instead of ~4,500
+-- libuv overhead drops by ~10x. Side benefit: larger chunks share
+a deflate window, so the output PDF is ~5 % smaller (16.1 MB →
+15.3 MB).
+
+Profile diff at `objectsPerStream: 500`
+(paired `--cpu-profile-process --cpu-sampling 100`):
+
+| metric                                          | serial @ 500 | parallel @ 500 | Δ |
+| ---                                             | ---          | ---            | --- |
+| `writeSync` self (zlib native, main thread)     | 335 ms       | 33 ms          | **-302 ms** |
+| `close` (libuv finalize)                        | 1.7 ms       | 15 ms          | +13 ms |
+| `PDFFlateStream.computeContents`                | 20 ms        | 4 ms           | -16 ms |
+| **total zlib-related main-thread self-time**    | **360 ms**   | **54 ms**      | **-306 ms (-85 %)** |
+| bottom-up: `writeSync` position                 | #1 (8.25 %)  | not in top 12  | gone |
+
+The 306 ms moved off the main thread to libuv's pool, where Node's
+V8 profiler doesn't sample it -- so the headline "writeSync gone from
+the top 12" describes the main-thread CPU budget that save() pays,
+not the total deflate work, which is unchanged.
+
+### Wall-clock note
+
+This whole sub-investigation deliberately compared profiles only,
+not wall-clock. The dev machine was busy with other work, and
+process is a ~2 s phase whose run-to-run jitter on a loaded system
+exceeds the expected delta. The profile diff cuts through that:
+306 ms of native zlib disappearing from the main-thread budget is
+a structural change that's stable across noise. A clean-machine
+wall-clock A/B would close the loop, but the optimisation is
+shippable on profile evidence alone.
+
+### Wired into production
+
+`render-book.mjs` swaps
+`pdfDoc.save({ objectsPerTick: Infinity })` for
+`parallelSave(pdfDoc, { objectsPerTick: Infinity, objectsPerStream: 500 })`.
+Smoke test on the book:
+
+```
+render:   8.6s  (1651 pages)
+generate: 39.2s  (raw 39.3 MB)
+process:  2.2s
+saved:    docs\_pdf\book.pdf  (15.3 MB)
+total:    51.9s
+```
+
+The 15.3 MB output (down from 16.1 MB) is the chunk-size effect;
+the parallel deflate doesn't change byte size, only where the work
+runs.
+
+The harness exposes the same via `--parallel-deflate` (which calls
+`parallelSave` with the same defaults).
+
+### Retiring `fast-deflate.mjs`
+
+Once `parallelSave` also pre-deflates the xrefStream, pdf-lib's
+lazy `cache.populate()` deflate path is **never invoked at
+runtime**. Every `PDFObjectStream` is parallel-deflated in phase 2;
+the xrefStream is sync-deflated in phase 3. Both go through
+`node:zlib`. There's no remaining call site for pdf-lib's pure-JS
+fallback during a save.
+
+The `fast-deflate.mjs` shim that used to monkey-patch
+`pako.deflate` is therefore redundant -- it was a per-call dispatch
+optimisation for a code path we no longer take. Deleted:
+
+- `docs/lib/fast-deflate.mjs` -- removed.
+- `import './lib/fast-deflate.mjs'` -- removed from
+  `render-book.mjs`.
+- `--fast-deflate` -- removed from the `measure.mjs` flag set.
+
+Smoke profile after removal (`--parallel-deflate --fast-refs
+--cpu-profile-process`, no fast-deflate import anywhere): 0 frames
+matching `pako`, 0 matches for `computeContents`, 0 for
+`fastDeflate`. Process phase 2.34 s, output 15.3 MB.
+
+The deletion is purely a cleanup -- profile-equivalent to before
+-- but it removes 38 lines of indirection and one transitive
+concern.
+
+### Routing inflate through `node:zlib` too
+
+One call site on the load side still went through pdf-lib's pako:
+`PDFXRefStreamParser` decompresses the xref stream's payload
+via `pako.inflate` during `PDFDocument.load`. Cost is tiny -- one
+inflate per load, ~3 ms -- but it's the last pdf-lib → pako edge
+in the runtime, and the dispatch story for the README is cleaner
+when "every zlib call goes through `node:zlib`" is true on both
+sides.
+
+`docs/lib/fast-inflate.mjs` is the symmetric counterpart to the
+retired `fast-deflate.mjs`:
+
+```js
+import { inflateSync } from "node:zlib";
+import pako from "pako";
+
+if (!pako.__fastInflateInstalled) {
+  const original = pako.inflate;
+  pako.inflate = function fastInflate(data, options) {
+    if (options) return original.call(pako, data, options);
+    return inflateSync(data);
+  };
+  pako.__fastInflateInstalled = true;
+}
+```
+
+`render-book.mjs` imports it unconditionally next to `fast-refs`.
+No harness flag -- the per-load cost is below the profile noise
+floor; this lands for the architectural reason, not a measurable
+win.
+
+## `BaseParser.parseRawNumber` + `parseRawInt`: direct-integer accumulators
+
+After `fast-deflate` + `fast-refs` + `parallel-deflate`, the load
+side of the bottom-up table shifted onto the parser frames. Two of
+them are `BaseParser.parseRawNumber` (called once per numeric
+token, twice per `N gen R` indirect reference) and
+`BaseParser.parseRawInt` (called twice per indirect-object header
+and twice per object inside an `ObjStm`). Between them they fire
+hundreds of thousands of times per load on the book.
+
+The upstream implementation
+(`pdf-lib/.../parser/BaseParser.js:33`) builds the number as a
+string, one character at a time, then converts:
+
+```js
+let value = '';
+while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {
+  value += charFromCode(this.bytes.next());
+}
+// ... fractional part, sign handling ...
+const numberValue = Number(value);
+if (!isFinite(numberValue) || numberValue > Number.MAX_SAFE_INTEGER) { ... }
+return numberValue;
+```
+
+Every call allocates a throwaway string of length 1..N (one `+=`
+allocation per digit), then runs `Number(value)` to parse the
+string back into a double, then runs guards. The string allocation
++ `Number()` round-trip is the cost we care about.
+
+### The shim
+
+`docs/lib/fast-parse-number.mjs` mutates both
+`BaseParser.prototype.parseRawNumber` and
+`BaseParser.prototype.parseRawInt` to accumulate the integer
+directly (`n = n * 10 + (byte - 0x30)`). The number variant
+additionally descends into decimal handling when a period appears.
+Both fall back to the original for:
+
+- **More than 15 integer digits** -- direct accumulation could
+  exceed `Number.MAX_SAFE_INTEGER` (16 digits) and silently lose
+  precision. Upstream's `Number(value)` retains correctly-rounded
+  double precision in that range and emits the spec-mandated
+  overflow warning, so we rewind and delegate.
+- **Empty-digit cases** (e.g. `+`, `.`, bare sign) -- rewind and
+  let upstream throw `NumberParsingError` with full diagnostic
+  context. Both fallback paths are vanishingly rare on real PDFs.
+
+`BaseParser` isn't re-exported by pdf-lib's index, so we reach it
+via the package's CJS internal path through `createRequire`:
+
+```js
+const require = createRequire(import.meta.url);
+const BaseParser = require('pdf-lib/cjs/core/parser/BaseParser.js').default;
+```
+
+Mutating `BaseParser.prototype` propagates to every subclass --
+`PDFParser`, `PDFObjectParser`, `PDFObjectStreamParser`,
+`PDFXRefStreamParser`. One side-effecting import covers them all.
+
+`render-book.mjs` imports it unconditionally next to `fast-refs`.
+No harness flag yet; the win is small per-call but the call rate
+is high enough to matter -- to be measured later when the
+follow-on work (size-in-bytes / iterator / parseDict shims) makes
+the parser side worth quantifying as a group.
+
+## `decodeName`: skip the regex on the 99.999 % no-`#` path
+
+The earlier closing summary above wrote off `decodeName` as "close
+to fundamental pdf-lib work." Re-reading the function on a later
+pass disproved that.
+
+`pdf-lib/.../objects/PDFName.js:9`:
+
+```js
+var decodeName = function (name) {
+    return name.replace(/#([\dABCDEF]{2})/g, function (_, hex) {
+        return utils_1.charFromHexCode(hex);
+    });
+};
+```
+
+PDF spec (ISO 32000-1 §7.3.5) requires `#XX` hex-escape for any
+byte outside the printable-ASCII regular range plus delimiters /
+whitespace. `decodeName` reverses that on every `PDFName.of(name)`
+call so the pool key is the canonical decoded form, dedup'ing
+`/foo#20bar` and `/foo bar` to the same instance.
+
+The catch: the regex has to scan every byte of every name looking
+for `#`, even when there is none.
+
+### Workload shape
+
+Instrumented `PDFName.of` on the book, counting calls and how
+often the input contains a `#`:
+
+```
+PDFName.of calls       : 2,759,635
+  raw input has # char : 2 (0.000%)
+```
+
+Two. In 2.76 million calls. The other 2,759,633 are regex scans
+against strings like `Type`, `S`, `P`, `Pg`, `StructElem`, `Kids`,
+`Count`, `Filter`, `FlateDecode` -- ordinary PDF names that need
+no escaping. We measured ~214 ms (7 %) of process self-time on
+`decodeName` and another ~91 ms on `PDFName.of`'s body that calls
+it.
+
+### The shim
+
+`docs/lib/fast-decode-name.mjs` follows the `fast-refs.mjs` shape:
+cache in front of `PDFName.of` rather than replacing it. The key
+insight is that when `name` has no `#`, the decoded form equals
+the raw form, so the raw `name` is already a valid pool key for
+pdf-lib's internal dedup pool -- a fast-side `Map<string, PDFName>`
+keyed by the raw input returns the same `PDFName` instance pdf-lib
+would have produced after a regex scan + pool lookup, without ever
+running the regex.
+
+```js
+import { PDFName } from "pdf-lib";
+
+if (!PDFName.__fastDecodeNameInstalled) {
+  const original = PDFName.of;
+  const fastCache = new Map();
+  PDFName.of = function fastOf(name) {
+    if (name.indexOf("#") === -1) {
+      const cached = fastCache.get(name);
+      if (cached) return cached;
+      const instance = original.call(PDFName, name);
+      fastCache.set(name, instance);
+      return instance;
+    }
+    return original.call(PDFName, name);
+  };
+  PDFName.__fastDecodeNameInstalled = true;
+}
+```
+
+Names with `#` fall through to the original -- the dual canonical-
+form contract is preserved exactly. Static `PDFName.Length`,
+`PDFName.FlateDecode`, ... initialisers ran when pdf-lib's module
+body executed (before the shim imports), so pdf-lib's pool is
+already populated with the canonical instances; the parser then
+hits the fast cache on every subsequent reference.
+
+### Results
+
+Paired A/B, four interleaved runs (`pre1 post1 pre2 post2`),
+`--detach-pages --no-timing --fast-refs --parallel-deflate
+--cpu-profile-process --cpu-sampling 100`, same 1651-page book:
+
+| metric        | pre avg | post avg | Δ                |
+| ------------- | ------- | -------- | ---------------- |
+| **process**   | **2.74 s** | **2.21 s** | **-0.53 s (-19 %)** |
+| ↳ load        | 1.69 s  | 1.40 s   | -0.29 s (-17 %) |
+| ↳ setOutline  | 0.01 s  | 0.01 s   | unchanged |
+| ↳ save        | 1.04 s  | 0.81 s   | -0.23 s (-22 %) |
+| pdf size      | 16.1 MB | 16.1 MB  | byte-identical pairwise (pre1↔post1, pre2↔post2; 31 B intra-pair drift is `/CreationDate`) |
+
+The load drop is what the instrumentation predicted. The save drop
+was a surprise -- save doesn't call `PDFName.of` to build outline
+metadata in the hot path, so the saving is almost certainly GC
+pressure relief from no longer allocating ~2.76 M regex-match
+objects during load.
+
+Profile diff (single run each, same flags):
+
+| function | PRE | POST | Δ |
+| --- | --- | --- | --- |
+| `decodeName`             | 214 ms (7.4 %) | not in top 15 | **-214 ms** |
+| `PDFName.of`             |  91 ms (3.1 %) | not in top 15 | **-91 ms** |
+| `fastOf` (the shim body) | n/a            |  91 ms (4.1 %) | +91 ms |
+| `(garbage collector)`    | 339 ms (11.7 %) | 238 ms (10.8 %) | -101 ms |
+| profile duration         | 2.92 s         | 2.22 s | -0.70 s |
+
+The `fastOf` row sits at the same self-time as the old
+`PDFName.of` forwarder (~91 ms) -- that's the per-call cost of the
+`indexOf` check + `fastCache.get` + return, which all calls now
+pay. The 214 ms `decodeName` row is gone entirely (regex never
+runs on the fast path), and the GC drop is the allocator relief.
+
+### Production confirmation
+
+Two consecutive `book.bat` runs with all four shims live
+(`fast-refs`, `fast-parse-number`, `parallel-deflate`,
+`fast-decode-name`):
+
+| metric | run 1 | run 2 |
+| --- | --- | --- |
+| render   | 8.9 s | 8.3 s |
+| generate | 39.3 s | 37.6 s |
+| process  | **1.6 s** | **1.6 s** |
+| total    | 51.8 s | 50.0 s |
+
+Process is now ~1.6 s on the production path, off the profiler.
+The harness numbers above are higher (~2.2 s post-fix) because of
+profiler-on attribution overhead at 100 us sampling -- the same
+caveat the `PDFRef.of` section flagged. The paired-A/B delta from
+the harness (-0.53 s) is the correct measure of the shim's win;
+the absolute 1.6 s is the production floor.
+
+### Methodology note
+
+This one almost didn't get found. The earlier "what's left" summary
+explicitly wrote `decodeName` off as "close to fundamental" parser
+work, on the strength of it living in a single regex line. The
+actual investigation took 30 seconds: read the function, ask
+"what's the hit rate of that regex on real PDF names?", instrument
+with a one-liner counter, find that the answer is 0.0001 %. Worth
+re-checking the "fundamental" label on remaining JS-body rows
+whenever a small change to the workload might invert it.
+
+## `numberToString`: skip the redundant toString/split on the 100 % no-`e` path
+
+`pdf-lib/.../utils/numbers.js:13` is pdf-lib's `.toString()`
+replacement that suppresses exponential notation -- PDF syntax
+requires plain decimal in the object body (`1e-7` is invalid), so
+every numeric token written into the file goes through:
+
+```js
+exports.numberToString = function (num) {
+    var numStr = String(num);
+    if (Math.abs(num) < 1.0) {
+        var e = parseInt(num.toString().split('e-')[1]);
+        if (e) { /* expand "1e-7" -> "0.0000001" */ }
+    } else {
+        var e = parseInt(num.toString().split('+')[1]);
+        if (e > 20) { /* expand "1e+21" -> "100...0" */ }
+    }
+    return numStr;
+};
+```
+
+`numStr` is computed up front via `String(num)`. Then -- regardless
+of whether `numStr` actually contains an `e` -- the function calls
+`num.toString()` *again*, allocates a `.split(...)` array, and
+runs `parseInt` on the (almost always undefined) result. Pure
+overhead on every call where `String(num)` already returned a
+plain decimal, which on a real PDF is every call.
+
+### Workload shape
+
+Instrumented `numberToString` on the book, counting fast-path
+(`String(num).indexOf('e') === -1`) vs slow-path hits:
+
+```
+numberToString calls : 290,231
+  String(num) has 'e' : 0 (0.000 %)
+```
+
+Zero. Of 290 k calls. `String(num)` returns exponential notation
+only when `|num| < 1e-6` or `|num| >= 1e21`, and a PDF's object
+refs, generations, byte offsets, content-stream coordinates,
+`/Size`, `/Length` etc. never land in either tail. The extra
+`toString()` + `split` + `parseInt` guarding the `e` cases is paid
+290 k times to handle 0 of them.
+
+### The shim
+
+`docs/lib/fast-number-to-string.mjs` short-circuits the no-`e`
+case and delegates the rare exponential cases to the original
+implementation unchanged:
+
+```js
+const fastNumberToString = function fastNumberToString(num) {
+  const numStr = String(num);
+  if (numStr.indexOf('e') === -1) return numStr;
+  return original(num);
+};
+numbers.numberToString     = fastNumberToString;
+utilsBarrel.numberToString = fastNumberToString;
+topBarrel.numberToString   = fastNumberToString;
+```
+
+### Wiring gotcha: tslib 1.x value-copy re-exports
+
+pdf-lib ships compiled against `tslib@1.14.1`, whose
+`__exportStar` is:
+
+```js
+function (m, exports) {
+    for (var p in m) if (p !== "default" && !exports.hasOwnProperty(p)) exports[p] = m[p];
+}
+```
+
+A plain value-copy. tslib 2.x replaced this with a live getter
+(`Object.defineProperty(o, p, { get: () => m[p] })`), so on modern
+compilations a single `numbers.numberToString = fast` patch would
+propagate through every re-export automatically. On 1.x it
+doesn't.
+
+`PDFNumber`'s call site -- the only consumer of `numberToString`
+in pdf-lib's source -- reads from the utils-barrel, not from
+`numbers.js` directly:
+
+```js
+// PDFNumber.js
+var index_1 = require("../../utils/index");
+...
+_this.stringValue = index_1.numberToString(value);   // <-- captured copy
+```
+
+Because `import { PDFDocument } from 'pdf-lib'` runs *before* the
+shim's dynamic import, the barrel has already executed
+`__exportStar(numbersModule, exports)` and stamped its own copy of
+the original function. Mutating `numbers.numberToString`
+afterwards is invisible to `PDFNumber`. The first iteration of
+this shim looked installed (the standalone test showed the patched
+function on the barrel, because that test imported the barrel
+*after* the shim) but the harness counter recorded 0 hits on the
+patched body -- the upstream function was still hot in the profile
+under its original name.
+
+Fix: patch every re-export in the chain that captures by value:
+`utils/numbers` (the source), `utils/index` (the barrel
+`PDFNumber` reads from), and `cjs/index` (pdf-lib's top-level,
+which `__exportStar`s the utils barrel onward to anyone importing
+from `'pdf-lib'`). All three get the same `fastNumberToString`
+reference.
+
+The `fast-decode-name` / `fast-refs` / `fast-parse-number` shims
+don't hit this trap because their targets are class-static methods
+(`PDFName.of`, `PDFRef.of`) or `BaseParser.prototype` methods --
+all looked up at call time via the class/prototype object, not via
+a captured value. `numberToString` is the first free function
+we've patched in pdf-lib.
+
+### Results
+
+Paired A/B, two interleaved runs each (`pre1 post1 pre2 post2`),
+`--detach-pages --no-timing --fast-refs --parallel-deflate
+--fast-decode-name --cpu-profile-process --cpu-sampling 100`,
+same 1638-page book:
+
+| metric                                  | pre1   | pre2   | post1  | post2  |
+| ---                                     | ---    | ---    | ---    | ---    |
+| upstream `numberToString` self-time     | 45 ms  | 51 ms  | 0 ms   | 0 ms   |
+| shim `fastNumberToString` self-time     | n/a    | n/a    | 5 ms   | 12 ms  |
+| **combined self-time on this function** | **45 ms** | **51 ms** | **5 ms** | **12 ms** |
+| slow-path delegations to original       | n/a    | n/a    | 0      | 0      |
+
+The `String(num).indexOf('e') === -1` short-circuit fires on 100 %
+of calls; the upstream function is unreachable in practice.
+Function-level self-time drops by ~80 % (~40 ms saved on the hot
+function), with the redundant `num.toString()` + `.split(...)` +
+`parseInt(...)` work gone from the trace.
+
+Wall-clock process-phase numbers on this dev machine bounce around
+enough run-to-run (~±0.15 s) that the ~40 ms function-level saving
+is invisible at the phase total -- both pre and post sit near
+2.05 s. The profile-level evidence is the real signal: the cycles
+were redundant, they're not being spent any more.
+
+### Methodology note
+
+The first cut of this shim mutated `numbers.numberToString` only,
+following the assumption that pdf-lib's re-exports would propagate
+the change. The hit counter (`fast=0 slow=0` on a full book run)
+caught the mistake before the README evidence was written -- a
+shim that *looks* installed but never actually runs would have
+shown identical "before" and "after" profile numbers within noise,
+indistinguishable from a no-op patch.
+
+Lesson for the next pdf-lib shim of a free function (rather than a
+class method): check `tslib.__exportStar`'s shape before assuming
+a single-site patch works.
+
+## `sizeInBytes`: stop allocating a base-2 string just to count its bits
+
+A fresh process-phase profile under the post-`fast-decode-name` /
+`fast-number-to-string` shipping set (1638-page book, `--fast-refs
+--parallel-deflate --fast-decode-name --fast-number-to-string
+--cpu-profile-process --cpu-sampling 100`) put process at 1.95 s
+and showed an oddly-shaped row in the top-15:
+
+```
+   self_ms   self_%   function  @  source
+   -------   ------   ----------------------------------------------
+    213.02   10.97%   (garbage collector)
+    171.60    8.83%   PDFDict.entries          pdf-lib/PDFDict.js:22
+    144.16    7.42%   PDFRef.of                pdf-lib/PDFRef.js:34
+    ...
+     56.48    2.91%   exports.sizeInBytes      pdf-lib/utils/numbers.js:37
+```
+
+`sizeInBytes` is a one-line utility:
+
+```js
+exports.sizeInBytes = function (n) { return Math.ceil(n.toString(2).length / 8); };
+```
+
+It computes how many bytes a non-negative integer takes by
+stringifying it as base-2, counting characters, and dividing by 8.
+The string is thrown away immediately.
+
+`find-callers.mjs` attributed the 56 ms across two callers, both
+inside the xref-stream writer:
+
+| caller | attributed |
+| --- | --- |
+| `bytesFor` (`utils/numbers.js:49`) -- sizes the `Uint8Array` that gets filled byte-by-byte | 29.6 ms |
+| `PDFCrossRefStream.computeMaxEntryByteWidths` (`structures/PDFCrossRefStream.js:66`) -- 3 calls per xref entry to compute the `/W` widths | 26.9 ms |
+
+For a ~50 k-object PDF that's roughly 300 k `n.toString(2)` calls
+per save, each allocating a short-lived 1-to-32-char string.
+Likely a contributor to the 213 ms GC at the top of the table too.
+
+### The shim
+
+`docs/lib/fast-size-in-bytes.mjs` replaces `utils.sizeInBytes`
+with a non-allocating short-circuit ladder:
+
+```js
+function fastSizeInBytes(n) {
+  if (n < 0x100) return 1;
+  if (n < 0x10000) return 2;
+  if (n < 0x1000000) return 3;
+  if (n < 0x100000000) return 4;
+  return 4 + Math.ceil((32 - Math.clz32(Math.floor(n / 0x100000000))) / 8);
+}
+```
+
+The ladder shape matches the actual value distribution in
+`computeEntryTuples`. The xref entry tuples are
+`(type, second, third)` where:
+
+- `type` is 0, 1, or 2 (1 byte, always)
+- `gen` / `index` are small (1-2 bytes)
+- `offset` for uncompressed entries reaches 3-4 bytes on a 16 MB
+  PDF
+- `nextFreeObjectNumber` for deleted entries is small
+
+So most calls take the very first branch. A `Math.clz32`-based
+alternative would be simpler but slower in the common case,
+because it always pays for the native call + sub + div + ceil.
+The ladder exits in one compare for the dominant case.
+
+Triple-patch shape mirrors `fast-number-to-string.mjs` -- pdf-lib
+ships compiled against tslib 1.x whose `__exportStar` value-copies
+re-exports rather than installing live getters, so consumers that
+read `sizeInBytes` through a barrel (`PDFCrossRefStream` does:
+`utils_1.sizeInBytes(...)`) hold a captured reference. Patch the
+source module, the utils/index barrel, and the top-level index to
+cover every observed call site. `utils.bytesFor` reads
+`exports.sizeInBytes` at call time from the same module object we
+mutate first, so it picks up the fast path without a separate
+patch.
+
+### Results
+
+A/B (2 runs each, `--fast-refs --parallel-deflate
+--fast-decode-name --fast-number-to-string --cpu-profile-process
+--cpu-sampling 100`, with `--fast-size-in-bytes` the only
+difference):
+
+| run | PRE | POST |
+| --- | --- | --- |
+| 1 | 1.95 s | 1.91 s |
+| 2 | 2.01 s | 1.93 s |
+| **avg** | **1.98 s** | **1.92 s** |
+| save sub-phase avg | 0.80 s | 0.73 s |
+
+**Δ = -60 ms process (-3.0 %).** The save sub-phase alone moves by
+-70 ms -- exactly where `sizeInBytes` lives (xref writer
+fires during save, not load), so the attribution lines up.
+
+Profile self-time, POST run:
+
+- `exports.sizeInBytes` row: 56.48 ms → undetectable. V8 inlined
+  the ladder into both callers; `fastSizeInBytes` doesn't appear
+  in the profile by name either.
+- GC: 213 ms → 201 ms (-12 ms, consistent with no longer
+  allocating ~300 k short-lived base-2 strings per save).
+- No cost migration to other rows. The surrounding parser /
+  writer rows are flat within noise.
+
+PDF byte-equivalent (31-byte `/CreationDate` drift between PRE
+and POST -- well inside the standard timestamp band).
+
+### Side finding: the harness flag set wasn't tracking production
+
+While landing this change, the harness flag set was audited
+against `render-book.mjs`'s imports. `render-book.mjs` was
+importing five `fast-*` shims (`fast-refs`, `fast-inflate`,
+`fast-parse-number`, `fast-decode-name`, `fast-number-to-string`),
+but `measure.mjs` only exposed three of them as flags
+(`--fast-refs`, `--fast-decode-name`, `--fast-number-to-string`).
+So the canonical process-profile command was measuring a *subset*
+of what production actually runs -- two production shims
+(`fast-inflate` and `fast-parse-number`) had been on for
+production and silently off for the perf harness.
+
+Wall-clock impact of that gap is small in absolute terms (the two
+missing shims target the load sub-phase, which is ~1.2 s out of
+the 1.95 s process total), but the bottom-up table in the
+canonical command was attributing time to functions that don't
+run that way in production. Fixed in the same change: `measure.mjs`
+now exposes `--fast-inflate` and `--fast-parse-number`, and the
+canonical command in the README lists all five production shims
+plus `--fast-size-in-bytes`.
+
+The general lesson: when a new shim lands, audit the harness's
+flag set against `render-book.mjs`'s import list. A flag missing
+on the harness side silently moves the harness baseline away from
+production -- and the divergence accumulates over time.
+
+## `PDFDict.entries`: stop allocating a tuple array per save
+
+A profile of the process phase with every prior shipping shim
+applied still showed `PDFDict.entries` at the top of the non-GC
+self-time table, ~10 % of process. The function is a one-liner:
+
+```js
+PDFDict.prototype.entries = function () {
+    return Array.from(this.dict.entries());
+};
+```
+
+Per call: one `MapIterator` + one outer Array + one fresh
+`[key, value]` tuple per entry (allocated by the iterator itself,
+then collected by `Array.from` into the outer array). The save
+path fires both consumers on every dict -- `sizeInBytes` first to
+measure, then `copyBytesInto` to write -- so on the book that's
+~100 k `Array.from` calls feeding the GC. `(garbage collector)`
+sat at the top of the table too, which is the cost shape the
+allocation pattern predicts.
+
+Both consumers immediately destructure the tuples:
+
+```js
+var entries = this.entries();
+for (var idx = 0, len = entries.length; idx < len; idx++) {
+    var _a = entries[idx], key = _a[0], value = _a[1];
+    ...
+}
+```
+
+So nothing actually wants the array-of-tuples shape -- the
+upstream code uses it because that's what `entries()` returns,
+and the materialised array is dead by the next iteration.
+
+### The shim
+
+`docs/lib/fast-dict-iter.mjs` replaces
+`PDFDict.prototype.sizeInBytes` and
+`PDFDict.prototype.copyBytesInto` with versions that iterate the
+underlying Map in place via `Map.prototype.forEach(callback,
+thisArg)`. The callback's positional `(value, key)` arguments
+mean no tuple is ever allocated, and routing per-call state
+through `forEach`'s `thisArg` instead of closure capture lets the
+callback stay a module-level function reference (no per-call
+closure context).
+
+The callbacks are hoisted to module top-level (not closures):
+
+```js
+function _sizeInBytesEntry(value, key) {
+  this.s += key.sizeInBytes() + value.sizeInBytes() + 2;
+}
+function _copyBytesIntoEntry(value, key) {
+  const buf = this.buf;
+  let off = this.off;
+  off += key.copyBytesInto(buf, off);
+  buf[off++] = CharCodes.Space;
+  off += value.copyBytesInto(buf, off);
+  buf[off++] = CharCodes.Newline;
+  this.off = off;
+}
+```
+
+Each consumer allocates a single small `ctx` object per call (one
+alloc, vs the prior `1 + N` Array allocations) and threads it
+through `thisArg`:
+
+```js
+PDFDict.prototype.copyBytesInto = function (buffer, offset) {
+  // ... write '<<\n' ...
+  const ctx = { buf: buffer, off: offset };
+  this.dict.forEach(_copyBytesIntoEntry, ctx);
+  offset = ctx.off;
+  // ... write '>>' ...
+};
+```
+
+The `PDFDict.prototype.entries` method itself stays untouched --
+`clone()` and `toString()` still call it and rely on the
+array-of-tuples contract. Those paths fire rarely (clone on
+incremental updates, toString in debug output) and don't justify
+the contract churn.
+
+### Results
+
+Profile diff, both runs `--detach-pages --no-timing` with every
+other shipping shim active, 100 us sampling:
+
+| metric                              | pre        | post       | Δ                  |
+| ---                                 | ---        | ---        | ---                |
+| `PDFDict.entries` self              | 164.16 ms  | off-list   | **-164 ms (-100 %)** |
+| `PDFDict.copyBytesInto` self        | 27.54 ms   | 25.42 ms   | flat               |
+| `_copyBytesIntoEntry` (callback)    | n/a        | 23.83 ms   | new                |
+| `PDFDict.sizeInBytes` self          | sub-cutoff | 15.89 ms   | n/a                |
+| `_sizeInBytesEntry` (callback)      | n/a        | 12.71 ms   | new                |
+| **dict-serialisation path subtotal**| **~192 ms (~11 % of process)** | **~78 ms (~5 % of process)** | **~-114 ms / -6 pp** |
+| `(garbage collector)`               | 201 ms (12 %) | 227 ms (15 %) | +26 ms / +3 pp  |
+
+The 164 ms `entries` self-time is reliably gone. The replacement
+work in the four-row split (the two consumers + their named
+callbacks) sums to ~78 ms -- about a **6 pp drop** in process
+attribution to this code path.
+
+The `(garbage collector)` row going *up* was the surprise. A
+first-cut variant of the shim used closures (``forEach((value,
+key) => { ... captures `offset` ... })``) and showed the same GC
+increase. Hypothesis: the captured-and-mutated `offset` cell was
+forcing V8 to heap-allocate a closure context per call. So we
+tested the hoisted-callback variant above, which has zero
+closure capture. The GC row landed at essentially the same
+share of process (~227 ms vs ~271 ms, both ~15 %).
+
+So the closure-capture hypothesis was wrong -- V8's escape
+analysis was already eliding the `offset` cell. The GC nudge is
+either run-to-run load-phase variance (the profile spans load +
+setOutline + save, and load dominates) or the per-call `ctx`
+object allocation we couldn't avoid without bigger code surgery.
+Either way it doesn't reverse the win: the dict-path attributable
+time dropped by ~114 ms (~192 ms -> ~78 ms), and that's real cycles
+removed.
+
+PDF output is byte-equivalent to the pre-shim build:
+`Map.forEach` iterates in insertion order, same as
+`Array.from(map.entries())`, so the serialised byte sequence is
+identical.
+
+### Lesson: hoist forEach callbacks when state is mutable
+
+The hoisted-callback pattern (callback = module-level function,
+state via `forEach`'s `thisArg`) reads as overkill -- a closure
+is fewer lines and easier to follow. Two reasons it's still the
+right shape here:
+
+1. **Profile attribution.** Named callbacks
+   (`_copyBytesIntoEntry`, `_sizeInBytesEntry`) appear in CPU
+   profiles under their names. Closures show up as
+   `(anonymous) @ file.mjs:55`, which makes future
+   profile-reading harder (you have to cross-reference the line
+   number every time).
+2. **Future-proofing against V8 changes.** Escape analysis can
+   handle the closure capture today, but the JIT's heuristics
+   shift across Node versions. The hoisted pattern is
+   semantically explicit -- no implicit allocation depends on
+   the compiler being smart. Same shape that has aged well in
+   other hot pdf-lib paths we've patched.
+
+Cost is negligible (six extra lines and two declarations);
+upside is the profile reads cleanly and the perf shape is robust
+to JIT changes. Worth doing whenever the callback's state
+outlives a single iteration.
+
+## `parseDict`: hoist the sentinel `PDFName`s out of the type-dispatch tail
+
+With every other process-phase shim in place, the top of the
+bottom-up table looked like:
+
+```
+   self_ms   self_%   function  @  source
+    194.12   12.49%   (garbage collector)
+    127.05    8.18%   PDFRef.of
+     86.70    5.58%   PDFObjectParser.parseName
+     80.70    5.19%   fastOf                       (fast-decode-name)
+     74.70    4.81%   PDFObjectParser.parseDict
+     ...
+```
+
+`fastOf` -- the cache in front of `PDFName.of` -- shouldn't be
+this high. The whole point of `fast-decode-name` is to collapse
+`PDFName.of` to a `Map.get` per call. So the question is why so
+many calls still hit it.
+
+Reading `PDFObjectParser.parseDict`
+(`pdf-lib/.../parser/PDFObjectParser.js:141`) shows the
+type-dispatch tail at the bottom:
+
+```js
+var Type = dict.get(PDFName.of('Type'));
+if (Type === PDFName.of('Catalog')) return PDFCatalog.fromMapWithContext(...);
+else if (Type === PDFName.of('Pages')) return PDFPageTree.fromMapWithContext(...);
+else if (Type === PDFName.of('Page'))  return PDFPageLeaf.fromMapWithContext(...);
+else                                   return PDFDict.fromMapWithContext(...);
+```
+
+Four `PDFName.of` calls per dict, **including** the dicts that
+have no `/Type` entry at all (resource dicts, font descriptors,
+content-stream dicts -- the bulk of what a real PDF contains).
+With `fast-decode-name` each call is a `fastCache.get` on a 4-to-7-byte
+string, which is cheap individually -- but on a 1638-page book
+that's tens of thousands of dicts × 4 calls = hundreds of
+thousands of cache lookups for the same handful of canonical
+`PDFName`s.
+
+### The shim
+
+`docs/lib/fast-parse-dict.mjs` replaces
+`PDFObjectParser.prototype.parseDict` with a version that
+captures the four sentinel `PDFName`s once at shim-load:
+
+```js
+const TypeName    = PDFName.of('Type');
+const CatalogName = PDFName.of('Catalog');
+const PagesName   = PDFName.of('Pages');
+const PageName    = PDFName.of('Page');
+```
+
+and references them directly in the type-dispatch tail:
+
+```js
+const Type = dict.get(TypeName);
+if (Type === CatalogName) return PDFCatalog.fromMapWithContext(dict, this.context);
+if (Type === PagesName)   return PDFPageTree.fromMapWithContext(dict, this.context);
+if (Type === PageName)    return PDFPageLeaf.fromMapWithContext(dict, this.context);
+return PDFDict.fromMapWithContext(dict, this.context);
+```
+
+The rest of the function body (the `<< ... >>` parse loop, the
+`dict.set` calls, the whitespace skipping) is verbatim. Pool-dedup
+guarantees the captured `PDFName`s are `===` to whatever the
+parser would have built via the slow `PDFName.of` calls, so the
+dispatch identity comparisons work unchanged.
+
+`PDFObjectParser` isn't re-exported from pdf-lib's index, so the
+shim reaches in via `pdf-lib/cjs/core/parser/PDFObjectParser.js`
+through `createRequire` -- same shape as `fast-parse-number.mjs`
+and `fast-dict-iter.mjs`.
+
+### Results
+
+Profile diff, both runs `--detach-pages --no-timing` with every
+other shipping shim active, 100 us sampling:
+
+| metric                              | pre        | post       | Δ                  |
+| ---                                 | ---        | ---        | ---                |
+| `fastOf` self                       | 80.70 ms (5.19 %) | 63.20 ms (4.43 %) | **-17.5 ms (-22 %)** |
+| `parseDict` / `fastParseDict` self  | 74.70 ms (4.81 %) | 77.79 ms (5.45 %) | flat (noise)       |
+| process wall-clock                  | 1.55 s     | 1.42 s     | -0.13 s (~noise floor) |
+
+The cleanest signal is the `fastOf` drop: removing four
+`PDFName.of` calls per dict re-attributes ~17 ms away from the
+cache layer. `parseDict`'s own self-time is essentially unchanged
+because the four `PDFName.of` calls were already being charged to
+`fastOf`, not to `parseDict` (child frames don't roll into parent
+self-time). So the optimisation reads as "fastOf got cheaper"
+rather than "parseDict got faster," but it's the same removed
+work either way.
+
+The 130 ms wall-clock delta is mostly within run-to-run noise on a
+1.5 s phase. The mechanism-confirmed ~17 ms via profile
+attribution is the honest number.
+
+PDF output is byte-equivalent: same Map iteration order, same
+dispatch decisions, same canonical `PDFName` instances.
+
+### Why this is the bottom of the easy wins on parseDict
+
+`fastParseDict` is still in the top 15 (5.45 %), which suggests
+more juice in the function. The next-tier targets are all in the
+inner loop:
+
+- `!bytes.done() && bytes.peek() !== 0x3E && bytes.peekAhead(1) !== 0x3E`
+  -- three method calls per iteration, all reading the underlying
+  `Uint8Array`. Inlining would cut method-dispatch overhead but
+  requires reaching into `ByteStream`'s internals.
+- `dict.set(key, value)` -- Map entry allocation. Could be swapped
+  for a plain object via `Object.create(null)`, but
+  `PDFDict.fromMapWithContext` and the existing `fast-dict-iter`
+  shim both assume a Map, so it's a larger surgery.
+- `this.skipWhitespaceAndComments()` -- already on the top-15 list
+  in its own right (~32 ms / 2 %). Two-method-call body
+  (`skipWhitespace` + `skipComment` loop); inlining at parseDict's
+  call site would shed one method-dispatch per loop iteration.
+
+None of these are as clean as the sentinel-hoist patch, and each
+is a bigger code change for a smaller individual win. Worth
+revisiting if a future optimisation moves the floor and parseDict
+becomes a larger relative share.
+
+## `parseObject`: dispatch by first byte, gate the keyword scans
+
+After `fast-parse-dict` shipped, `PDFObjectParser.parseObject` was
+the next obvious row in the bottom-up table:
+
+```
+   self_ms   self_%   function  @  source
+    213.28   13.41%   (garbage collector)
+    113.05    7.11%   fastParseDict
+     99.12    6.23%   fastOf
+     86.87    5.46%   PDFRef.of
+     86.32    5.43%   PDFObjectParser.parseName
+     81.86    5.15%   PDFObjectParser.parseObject     <-- this row
+     ...
+```
+
+`parseObject` is the dispatch hub of the PDF object parser. It's
+called once per dict value, per array element, and per
+indirect-object body -- same call density as `fastParseDict` four
+rows above (every dict that `fastParseDict` builds calls `parseObject`
+N times for its N values).
+
+### What parseObject was doing
+
+The upstream body (`PDFObjectParser.js:36`):
+
+```js
+parseObject() {
+  this.skipWhitespaceAndComments();
+  if (this.matchKeyword(Keywords.true))  return PDFBool.True;
+  if (this.matchKeyword(Keywords.false)) return PDFBool.False;
+  if (this.matchKeyword(Keywords.null))  return PDFNull;
+  const byte = this.bytes.peek();
+  if (byte === LessThan && this.bytes.peekAhead(1) === LessThan) return this.parseDictOrStream();
+  if (byte === LessThan)          return this.parseHexString();
+  if (byte === LeftParen)         return this.parseString();
+  if (byte === ForwardSlash)      return this.parseName();
+  if (byte === LeftSquareBracket) return this.parseArray();
+  if (IsNumeric[byte])            return this.parseNumberOrRef();
+  throw new PDFObjectParsingError(this.bytes.position(), byte);
+}
+```
+
+Three speculative `matchKeyword` calls run on every invocation,
+before the dispatch byte is ever peeked. On a fast-fail mismatch,
+`matchKeyword` (`BaseParser.js:97`) records `bytes.offset()`,
+calls `bytes.next()` for the first byte of the keyword, compares
+it, then calls `bytes.moveTo(initialOffset)` to rewind. Three of
+those per `parseObject` call -- multiplied by the hundreds of
+thousands of calls per book load -- add up.
+
+`true` / `false` / `null` are extraordinarily rare in real PDFs.
+The bulk of dict values are refs (`N N R`), numbers, names,
+sub-dicts, and arrays. Putting the dispatch-byte test *before*
+the keyword scans, and only entering `matchKeyword` when the
+first byte could plausibly start one of the three keywords,
+skips three `matchKeyword` calls (each with its own `moveTo`
+rewind) per `parseObject` on the overwhelming majority of inputs.
+
+### The shim
+
+`docs/lib/fast-parse-object.mjs` replaces
+`PDFObjectParser.prototype.parseObject` with:
+
+```js
+parseObject() {
+  this.skipWhitespaceAndComments();
+  const bytes = this.bytes;
+  const byte = bytes.peek();
+  if (IsNumeric[byte]) return this.parseNumberOrRef();
+  if (byte === LessThan) {
+    if (bytes.peekAhead(1) === LessThan) return this.parseDictOrStream();
+    return this.parseHexString();
+  }
+  if (byte === ForwardSlash)      return this.parseName();
+  if (byte === LeftSquareBracket) return this.parseArray();
+  if (byte === LeftParen)         return this.parseString();
+  if (byte === t_code && this.matchKeyword(KwTrue))  return PDFBool.True;
+  if (byte === f_code && this.matchKeyword(KwFalse)) return PDFBool.False;
+  if (byte === n_code && this.matchKeyword(KwNull))  return PDFNull;
+  throw new PDFObjectParsingError(bytes.position(), byte);
+}
+```
+
+Three changes from upstream:
+
+1. Peek the first byte once, up front.
+2. Dispatch order reshuffled for dict-value frequency: numbers /
+   refs first (`IsNumeric[byte]` is a Uint8Array index, the
+   cheapest possible test), then `<<` / `<` (collapsed into one
+   `LessThan` branch with the `peekAhead` lookup inside), then
+   names, arrays, strings.
+3. The three keyword paths are gated -- `byte === t` / `f` / `n`
+   guards each `matchKeyword` call, so a non-keyword input never
+   pays for the speculative scan + rewind.
+
+Correctness: a value starting with `t`/`f`/`n` that isn't
+`true`/`false`/`null` falls through to the same
+`PDFObjectParsingError` the upstream code would throw. Dict keys
+can't reach parseObject (`parseDict` calls `parseName()` for
+keys, parseObject only for values), and names always start with
+`/`. Numbers can't start with letters. So the only valid values
+that hit the gated keyword branches are the three keywords
+themselves.
+
+`PDFObjectParser` isn't re-exported from pdf-lib's index, so the
+shim reaches in via `pdf-lib/cjs/core/parser/PDFObjectParser.js`
+through `createRequire` -- same shape as `fast-parse-dict.mjs`.
+
+### Results
+
+Profile diff, both runs `--detach-pages --no-timing` with every
+other shipping shim active, 100 us sampling:
+
+| metric                                  | pre        | post       | Δ                  |
+| ---                                     | ---        | ---        | ---                |
+| `parseObject` / `fastParseObject` self  | 81.86 ms (5.15 %) | 40.25 ms (3.07 %) | **-41.6 ms (-51 %)** |
+| `fastOf` self                           | 99.12 ms (6.23 %) | 64.18 ms (4.90 %) | -34.9 ms           |
+| `fastParseDict` self                    | 113.05 ms (7.11 %) | 65.26 ms (4.98 %) | -47.8 ms           |
+
+The targeted row roughly halves in self-time, as the model
+predicts (three `matchKeyword` calls collapsed to first-byte
+dispatch). The `fastOf` and `fastParseDict` drops aren't from
+this shim doing less work in those frames -- they're profile
+attribution shifting around once `parseObject` is no longer
+dominating its own children's sampling window (sampled duration
+fell from 1.58 s to 1.34 s overall).
+
+Wall-clock is too noisy on this machine to resolve a change of
+this size, so the ~42 ms self-time drop in the profile -- which
+matches the mechanism described above -- is the number to trust.
+
+PDF output is byte-equivalent: same dispatch decisions, same
+fallthrough behaviour, same error shape.
+
+## Strip the parse-speed machinery: synchronify the load path
+
+After the eight `--fast-*` patches above had nibbled the process
+phase from 7.8 s down to 1.66 s, the next interesting thing in the
+profile wasn't *a function* -- it was *function scaffolding*.
+Three top-15 rows were the tslib `__awaiter` / `__generator`
+machinery that pdf-lib's TypeScript downlevel emits for its
+`async`-marked parser methods:
+
+```
+   self_ms   self_%   function                                  source
+   -------   ------   ----------------------------------------------
+     51.66    3.12%   (anonymous)  (parseIndirectObject body)   PDFParser.js:126
+     43.05    2.60%   step         (generator runner)           tslib.js:123
+     40.90    2.47%   (anonymous)  (parseIndirectObjects body)  PDFParser.js:190
+```
+
+Together ~135 ms / ~8 % of process self-time, sitting on top of
+the parsing work that's already attributed to the named frames
+below them.
+
+### What that scaffolding was for
+
+pdf-lib targets browsers as well as Node. On a browser, locking
+the main thread for the seconds it takes to parse a big PDF would
+freeze the page, so pdf-lib has a knob -- `parseSpeed`, also
+exposed as `objectsPerTick` -- that controls how many indirect
+objects the parser processes before yielding to the event loop via
+`await waitForTick()`. The default is the cautious
+`ParseSpeeds.Slow = 100`. The mechanism is a constructor-installed
+predicate (`PDFParser.js:31`):
+
+```js
+this.shouldWaitForTick = function () {
+  this.parsedObjects += 1;
+  return this.parsedObjects % this.objectsPerTick === 0;
+};
+```
+
+…queried at the bottom of every `parseIndirectObjects` iteration
+(`PDFParser.js:215`) and every `parseIntoContext` iteration in
+`PDFObjectStreamParser.js:42`, gating an `await waitForTick()`
+(upstream implements it as a zero-delay `setTimeout`).
+
+`render-book.mjs` already passed `parseSpeed: ParseSpeeds.Fastest`
+to `PDFDocument.load`, which is `objectsPerTick: Infinity`, which
+makes `shouldWaitForTick()` return `false` on every call: the
+modulo never hits zero, the yield never fires. The
+`Fastest`-vs-`Slow` speedup we'd measured years earlier (see
+[01-baseline-and-detach.md](01-baseline-and-detach.md))
+was precisely removing those yields' wall-clock contribution.
+
+But removing the *yields* didn't remove the **scaffolding**. Even
+with `objectsPerTick: Infinity`, every call to
+`parseIndirectObject` still:
+
+1. Allocates a Promise (the `__awaiter` return).
+2. Allocates a generator object (the inner `__generator` return).
+3. Allocates an activation record (the closed-over `_a` state).
+4. Enters the tslib `step` runner, which calls the generator
+   body, which enters `switch (_a.label) { case 0: ... }`, runs
+   all the synchronous work, falls through to `return [2 /*return*/, ref]`,
+   which `step` unpacks and resolves the Promise with.
+5. The caller `await`s that Promise (one microtask hop).
+
+For ~50 k indirect objects on the book that's 50 k of each.
+Roughly ~135 ms of attributed self-time (the three rows above)
+plus an unknowable but non-trivial fraction of the 240 ms GC row
+(Promise + generator + activation are all short-lived heap
+allocations).
+
+The same shape applies to `parseIndirectObjects` (which calls
+`parseIndirectObject`), `parseDocumentSection` (which calls
+`parseIndirectObjects`), `parseDocument` (which calls
+`parseDocumentSection`), and `PDFDocument.load` (which calls
+`parseDocument`). Five `async` wrappers around code that, on the
+hot path, runs synchronously.
+
+### Why bother on the ObjStm branch too
+
+`parseIndirectObject` *does* have one genuinely awaiting branch
+at `PDFParser.js:142`: if the parsed object is an object stream
+(introduced in PDF 1.5; ISO 32000-1 §7.5.7, type `ObjStm`), it
+dispatches to `PDFObjectStreamParser.parseIntoContext()`, which
+is itself `async`. But `parseIntoContext`'s only `await` is the
+same kind of conditionally gated `waitForTick` -- and
+`shouldWaitForTick` is passed in from the parent parser, so it
+still always returns `false` under our config. The whole
+sub-stream walk is therefore already synchronous in effect;
+upstream simply has no code path that runs the parser without
+the `shouldWaitForTick` gate.
+
+(Aside: Chrome's `SkPDF` writer doesn't emit ObjStm at all -- it
+writes every indirect object at its own xref offset and uses the
+classic xref table. So on our pipeline the ObjStm branch of
+`parseIndirectObject` doesn't even fire. But pdf-lib loads have
+to work generically; the patch handles the branch correctly.)
+
+### The shim
+
+`docs/lib/fast-sync-load.mjs` replaces six methods -- five on
+prototypes plus the static `PDFDocument.load` -- with synchronous
+twins:
+
+```
+PDFParser.prototype.parseDocument
+PDFParser.prototype.parseDocumentSection
+PDFParser.prototype.parseIndirectObjects
+PDFParser.prototype.parseIndirectObject
+PDFObjectStreamParser.prototype.parseIntoContext
+PDFDocument.load   (static)
+```
+
+The bodies are line-by-line ports of the upstream `case`-blocks --
+same loop, same `parseObject` / `context.assign` / `parseHeader` /
+`maybeParseCrossRefSection` / `maybeParseTrailerDict` /
+`maybeParseTrailer` / `skipJibberish` calls in the same order --
+with three changes:
+
+1. No `__awaiter` / `__generator` wrapper. The function returns
+   directly.
+2. No `shouldWaitForTick` check, no `waitForTick` yield.
+3. The three `PDFName.of(...)` calls in `parseIndirectObject`'s
+   type-dispatch tail (`'Type'`, `'ObjStm'`, `'XRef'`) are hoisted
+   to module-level constants -- same trick as
+   [`fast-parse-dict.mjs`](#parsedict-hoist-the-sentinel-pdfnames-out-of-the-type-dispatch-tail),
+   since pool-dedup makes the `PDFName` instances reference-stable.
+
+The patches have to land together: each method awaits the next
+one down, so desugaring any one in isolation still leaves a
+Promise chain dangling.
+
+`PDFDocument.load`'s signature is preserved -- still callable as
+`await PDFDocument.load(bytes)`. `await` on a non-Promise value
+just yields that value (after one microtask tick), so call sites
+that `await` the result need no change; a caller that chained
+`.then(...)` directly on the return value would need updating.
+The `parseSpeed` option is now silently ignored (no yield gate
+left to tune).
+
+The shim's correctness depends on the upstream pdf-lib source
+being structurally what the line-by-line port assumed. `pdf-lib`
+1.17.1 (Hopding's last release, abandoned) is byte-stable on npm
+and that's what we ship against. This change therefore pins
+`pdf-lib` in `package.json` to exactly `1.17.1` (was `^1.17.1`),
+and likewise pins `puppeteer` to exactly `25.0.4`, so a stray
+`npm update` can't silently swap upstream from under the shim.
+
+### Results
+
+Paired process-phase profiles, same harness config except
+`--fast-sync-load`:
+
+| metric                                  | PRE       | POST      | Δ                |
+| ---                                     | ---       | ---       | ---              |
+| **process wall-clock**                  | **1.66 s** | **1.30 s** | **-0.36 s (-22 %)** |
+| ↳ load                                  | 1.09 s    | 0.81 s    | -0.28 s (-26 %)  |
+| ↳ save                                  | 0.56 s    | 0.48 s    | -0.08 s (noise; writer not touched) |
+| GC self-time                            | 240 ms    | 187 ms    | -53 ms (-22 %)   |
+| `(anonymous) @ PDFParser.js:126`        | 51.66 ms  | gone      | -51.66 ms        |
+| `step @ tslib.js:123`                   | 43.05 ms  | gone      | -43.05 ms        |
+| `(anonymous) @ PDFParser.js:190`        | 40.90 ms  | gone      | -40.90 ms        |
+| **scaffolding total**                   | **~135 ms** | **0**   | **-135 ms (eliminated)** |
+
+The wall-clock delta is larger than the sum of the eliminated
+rows because the GC win is real time too: the per-object Promise
++ generator + activation allocations weren't free in V8's
+internals either, just not attributed to any named frame.
+
+Output PDF: byte-count identical (16,077,319 bytes both runs);
+MD5 differs only because Chrome's `page.pdf()` embeds a fresh
+`/CreationDate` + `/ModDate` per run (same ±27-byte timestamp
+jitter `docs/book.bat` output has always had).
+
+### Extending to the save side
+
+The shim covers the writers too, by symmetry. Three more methods:
+
+```
+PDFWriter.prototype.serializeToBuffer
+PDFWriter.prototype.computeBufferSize
+PDFStreamWriter.prototype.computeBufferSize
+```
+
+Only `serializeToBuffer` actually runs on our pipeline --
+`ParallelStreamWriter extends PDFStreamWriter` overrides
+`computeBufferSize` with its own three-phase parallel-deflate
+version (genuinely async because of `await Promise.all(deflated)`
+over libuv's thread pool, which we keep). But the inherited
+`serializeToBuffer` still had a dead `shouldWaitForTick` gate in
+its main loop. Same shape as the load side: per-object dispatch,
+no actual yield because `objectsPerTick` is effectively `Infinity`,
+but every iteration pays the generator-machine + Promise cost.
+
+`serializeToBuffer` stays `async` (it has to `await
+this.computeBufferSize()`, which is the genuinely-async override).
+The change is: drop the `__awaiter` / `__generator` wrapper, use
+ES `async function` with one real `await`, strip the
+`shouldWaitForTick` gate. `computeBufferSize` on both base and
+stream writers becomes fully synchronous (their only async
+ingredient was the same dead yield).
+
+Measured wins on the writer side: **none reliably above noise**.
+The save phase dropped from 0.56 s before the load-side patches
+to 0.48 s after, and the writer patches don't move it further
+(0.50 s in the post-extension profile, within the run-to-run
+band). No writer frame ever broke into the top 15 in the first
+place -- the overhead was real but distributed across
+unattributed scaffolding and `(program)` time, not big enough to
+register individually.
+
+The reason to ship it anyway is structural, not performance: with
+load patched, the only remaining
+`shouldWaitForTick` / `waitForTick` references in our hot path
+were on the save side, and leaving them would defeat the "rip out
+the machinery" intent. With the save patches landed, neither
+phase routes through tslib `__awaiter` scaffolding except where
+there's a legitimate `await` underneath.
+
+### Dropping the flags
+
+The companion change is to drop the `parseSpeed` / `objectsPerTick`
+options from all our call sites, since with the shim in effect
+neither does anything:
+
+- `docs/render-book.mjs` drops `parseSpeed: ParseSpeeds.Fastest`
+  from `PDFDocument.load` and `objectsPerTick: Infinity` from
+  `parallelSave`. The `ParseSpeeds` import goes with them.
+- `docs/lib/parallel-deflate.mjs` drops `objectsPerTick` from
+  `parallelSave`'s public options object and from
+  `ParallelStreamWriter`'s constructor parameters. `PDFWriter`'s
+  base constructor still takes `objectsPerTick` as positional
+  arg 2 -- vestigial after `fast-sync-load`, but we pass
+  `Infinity` explicitly to make the constructor chain happy.
+- `perf/measure.mjs` removes the same options from
+  `PDFDocument.load`, `parallelSave`, and `pdfDoc.save`.
+
+`perf/profile-roundtrip.mjs` keeps its `parseSpeed` /
+`objectsPerTick` knob comparison -- that file's whole purpose is
+to A/B pdf-lib's defaults against `Fastest`, and it runs against
+vanilla pdf-lib without the shim by design.
+
+## Replace `PDFDict`'s backing `Map` with a flat array
+
+With `fast-dict-iter` and `fast-parse-dict` both shipping, the
+process-phase CPU profile read tidy enough that the next move was
+to look at the *other* side of the ledger: the sampling heap
+profile rather than CPU. The motivating run, captured with the
+canonical heap command (`--heap-profile-process --heap-sampling
+512`):
+
+```
+   self_kb   self_%   function  @  source
+  54315.27   34.75%   set                                  (V8 builtin)
+  24804.17   15.87%   Map                                  (V8 builtin)
+  19488.12   12.47%   PDFObjectParser.parseArray
+  16786.41   10.74%   PDFParser.parseIndirectObjectHeader
+  15329.21    9.81%   PDFObjectParser.parseNumberOrRef
+   9599.45    6.14%   fastParseDict        (fast-parse-dict.mjs)
+   9581.25    6.13%   fastOf               (fast-decode-name.mjs)
+   ...
+```
+
+`set` and `Map` together at ~80 MB -- **half of all process-phase
+allocations** -- were the natural place to start.
+`find-heap-callers.mjs` attributed them cleanly:
+
+```
+$ node find-heap-callers.mjs process.heapprofile set
+set: total=53.04 MB
+  39107.27 KB   fastParseDict @ fast-parse-dict.mjs:62
+   7168.04 KB   PDFParser.parseIndirectObjectHeader
+   7168.04 KB   parseIndirectObjectSync @ fast-sync-load.mjs:140
+    ...
+
+$ node find-heap-callers.mjs process.heapprofile Map
+Map: total=24.22 MB
+  24691.51 KB   fastParseDict @ fast-parse-dict.mjs:62
+    112.13 KB   buildPdfObjectsForOutline
+```
+
+84 % of the combined Map+set traffic was one site, the parser's
+per-dict accumulator inside `fastParseDict`:
+
+```js
+const dict = new Map();             // 24 MB of Map() constructors here
+while (...) {
+  const key = this.parseName();
+  const value = this.parseObject();
+  dict.set(key, value);             // 38 MB of set() entries here
+  ...
+}
+return PDFDict.fromMapWithContext(dict, this.context);
+```
+
+One `new Map()` + N `Map.prototype.set` calls per parsed dict,
+then the Map gets stored as `PDFDict.dict` and consulted later by
+all the `PDFDict` methods. Every Map allocates a header + a
+hash-table backing arena + per-entry bucket objects; on the book
+that's ~9 k dicts each paying for an arena it doesn't need,
+because PDF dicts are **tiny** (typical has <= 10 entries, most
+have 2-3) and nothing in pdf-lib's API touches a parsed dict
+often enough for the hash to pay back.
+
+The remaining 16 % was `context.assign` populating
+`PDFContext.indirectObjects` (a Map<PDFRef, PDFObject>) -- that's
+a single Map shared across the load, not addressed here.
+
+### The shape
+
+Replace the Map with a flat alternating array:
+
+```js
+// before
+this.dict = new Map([[key0, value0], [key1, value1], ...]);
+// after
+this.dict = [key0, value0, key1, value1, ...];
+```
+
+One allocation per dict (the array; the entries are stored inline
+in the array's backing store, no per-entry boxes). Lookups become
+linear scans:
+
+```js
+function indexOfKey(arr, key) {
+  for (let i = 0, len = arr.length; i < len; i += 2) {
+    if (arr[i] === key) return i;
+  }
+  return -1;
+}
+```
+
+For 5-entry dicts (the dominant size class), a 5-iteration linear
+scan with strict-equality comparison beats `Map.prototype.get`
+(which has to hash the key, then walk a hash-bucket chain) on
+every V8 microbench checked. The crossover is somewhere around
+20-30 entries; PDF dicts almost never get there.
+
+### Compatibility
+
+`PDFDict`'s public method surface is
+`.keys / .values / .entries / .set / .get / .has / .delete /
+.lookup / .lookupMaybe / .asMap / .clone / .toString /
+.sizeInBytes / .copyBytesInto / .uniqueKey`. Grepping the rest of
+pdf-lib confirmed every consumer goes through that surface --
+`viewerPrefs.dict.set(...)`, `widgetAnnot.dict.get(...)`,
+`xrefStream.dict.set(...)`, etc. all call `PDFDict.prototype.set`/
+`.get`, which we re-implement against the array. Nobody in the
+codebase touches `dict.dict` expecting Map-specific iterators.
+The single direct-Map use, `asMap()`, still returns a fresh
+`new Map(...)` for any caller that wants one.
+
+The seam factories that take a `Map` argument
+(`fromMapWithContext`, `withContextAndPages`, `PDFPageLeaf.clone`'s
+`new Map()` initializer) get small wrappers that convert at the
+boundary. They're called a handful of times per document --
+catalog + page tree + page leaves -- so the conversion is free
+relative to the parser's ~9 k dicts.
+
+### Subsumes two earlier shims
+
+The two existing dict-shape shims are no longer useful in front of
+the array shape:
+
+- `fast-dict-iter` patched `PDFDict.sizeInBytes` and `copyBytesInto`
+  to call `this.dict.forEach((value, key) => ...)` instead of
+  `Array.from(this.dict.entries())`. With `this.dict` as a flat
+  array, both methods become `for (let i = 0; i < arr.length; i += 2)`
+  -- no `forEach`, no `thisArg` context object, no callback
+  allocation.
+- `fast-parse-dict` patched `parseDict` to hoist the
+  Type/Catalog/Pages/Page sentinel `PDFName.of` calls into
+  module-level constants. The new `parseDict` (in
+  `fast-dict-array.mjs`) keeps the hoisted constants and also
+  accumulates into the flat array directly. The Type-sentinel
+  dispatch becomes a short linear scan over the array; PDF
+  convention places `/Type` at index 0 or 2, so it's effectively
+  O(1) per dict.
+
+`fast-dict-array.mjs` carries both behaviours inline. The two
+older shims stay in the tree as opt-in flags on `measure.mjs`
+(useful for A/B against the `Map` shape) but are mutually
+exclusive with `--fast-dict-array` (the harness errors if you
+combine them).
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+same canonical command otherwise):
+
+| Allocator        | Map shape (before) | Array shape (after) | Delta   |
+|------------------|-------------------:|--------------------:|--------:|
+| `set` builtin    |          54.3 MB   |             14.8 MB | -73 %   |
+| `Map` builtin    |          24.8 MB   |    < 1 MB (off top) | -96 %   |
+| `push` builtin   |              -     |              2.8 MB | +2.8 MB |
+| Total sampled    |         152.6 MB   |            140.1 MB | -8 %    |
+
+The total-allocation drop is smaller than the Map+set drop
+because the sampling profiler reattributes the array contents
+(`PDFObject` references that used to sit inside Map bucket
+allocations) to the `fastParseDictArray` frame that allocates the
+array -- the allocations are still there, just attributed
+differently. The **real** win is the absence of Map header +
+hash-table arena per dict, which the profile shows by the `Map`
+row collapsing.
+
+CPU profile (paired `--cpu-profile-process --cpu-sampling 100`):
+
+| Row                                | Before  | After   |
+|------------------------------------|--------:|--------:|
+| `(garbage collector)`              | 213 ms  | 170 ms  |
+| `fastParseDict` / `fastParseDictArray` | 113 ms  |  40 ms  |
+| `PDFDict.copyBytesInto` + `_copyBytesIntoEntry` | 60 ms |  26 ms  |
+
+Wall-clock (paired no-profile, 4 runs each, mean process phase):
+
+| Shape        | Process (mean) | Range          |
+|--------------|---------------:|----------------|
+| Map (before) |        1.180 s | 1.15 - 1.20 s  |
+| Array (after)|        1.132 s | 1.11 - 1.15 s  |
+
+**~48 ms saved on the 1.18 s process phase (~4 %).** The
+profile-time delta is bigger than the wall-clock delta because
+the CPU profiler's sampling overhead falls disproportionately on
+hot allocator paths -- a familiar caveat. The honest signal is
+the no-profile A/B.
+
+The output PDF is structurally identical (1651 pages, 1773
+outline nodes, same title / creator metadata), within the build's
+intrinsic timestamp/random-ID noise (the build is
+non-deterministic between runs anyway -- two consecutive no-shim
+runs differ by ~30 bytes too).
+
+`docs/render-book.mjs` swaps `./lib/fast-dict-iter.mjs` +
+`./lib/fast-parse-dict.mjs` for the single
+`./lib/fast-dict-array.mjs` import. The two older shims stay in
+the tree for A/B; the harness rejects combining them with
+`--fast-dict-array`.
+
+## Replace `PDFContext.indirectObjects` with a dense array
+
+With `fast-dict-array` shipping, the per-dict `new Map()` +
+`Map.prototype.set` traffic was gone -- but the heap profile
+still showed ~14.5 MB of `set` self-size. `find-heap-callers`
+localized it cleanly to one remaining site, attributed to two
+V8-inlined parent frames:
+
+```
+$ node find-heap-callers.mjs <post-dict-array>.heapprofile set
+set: total=14.49 MB
+  7168.04 KB   PDFParser.parseIndirectObjectHeader
+  7168.04 KB   parseIndirectObjectSync @ fast-sync-load.mjs:140
+```
+
+Both rows are the same logical call: `this.indirectObjects.set(ref, object)`
+inside `PDFContext.assign` (`pdf-lib/.../PDFContext.js:34`), fired
+once per indirect object during load. On the book that's ~9 k
+entries; V8's Map grows the underlying hash table through ~12
+doubling steps to fit them (4 -> 8 -> ... -> 16384), discarding
+each intermediate arena. The 14 MB total is final arena + bucket
+allocations + all the discarded growth arenas.
+
+`PDFRef`s are overwhelmingly gen=0 (rare gen!=0 cases come from
+revisions / incremental updates). `fast-refs` already uses a
+dense array indexed by `objectNumber` for the **key** side --
+`PDFRef.of`'s gen=0 pool. The same trick applies on the **value**
+side for `indirectObjects`: dense array keyed by `objectNumber`.
+
+### The shim
+
+`docs/lib/fast-indirect-objects.mjs` patches
+`PDFContext.prototype.assign / lookup / lookupMaybe / delete /
+getObjectRef / enumerateIndirectObjects` to consult an auxiliary
+`this._objArr` (dense array indexed by `objectNumber`) for gen=0
+`PDFRef`s first, falling back to the original Map for gen!=0.
+Lazy init on first `assign` -- no constructor patching needed.
+The original Map sits at `this.indirectObjects` unchanged; gen=0
+entries skip it entirely.
+
+```js
+PDFContext.prototype.assign = function (ref, object) {
+  if (ref.generationNumber === 0) {
+    if (!this._objArr) this._objArr = [];
+    this._objArr[ref.objectNumber] = object;     // dense store, no Map
+  } else {
+    this.indirectObjects.set(ref, object);       // gen!=0 fallback
+  }
+  if (ref.objectNumber > this.largestObjectNumber) {
+    this.largestObjectNumber = ref.objectNumber;
+  }
+};
+```
+
+`lookup` / `lookupMaybe` resolve the ref the same way, then run
+the original type-check tail verbatim. `delete` nulls the slot
+rather than splicing it out, so subsequent objectNumbers keep
+their slots.
+`getObjectRef` linear-scans the dense array first, then the Map.
+The interesting one is `enumerateIndirectObjects`: dense-array
+iteration is already in ascending objectNumber order, so when
+the gen!=0 Map is empty (the parsed-PDF common case) the method
+returns without sorting -- the upstream
+`Array.from(this.indirectObjects.entries()).sort(byAscendingObjectNumber)`
+becomes a single linear pass with no `Array.from` materialization
+and no sort.
+
+### Measured wins
+
+CPU profile (paired `--cpu-profile-process --cpu-sampling 100`,
+fast-dict-array baseline vs + fast-indirect-objects):
+
+| Row                     | Pre (ms) | Post (ms) | Note               |
+|-------------------------|---------:|----------:|--------------------|
+| (garbage collector)     |   162.50 |    176.83 | within noise       |
+| **PDFContext.assign**   | **41.83**| **out of top 15** | **drops off**  |
+| PDFRef.of               |   124.42 |    118.24 | within noise       |
+| Total profile duration  |  1.21 s  |   1.14 s  | -70 ms             |
+
+The headline is `PDFContext.assign` exiting the top 15.
+Everything else moves within the sample-count noise band.
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`):
+
+| Allocator         | Pre (KB)  | Post (KB) | Delta                |
+|-------------------|----------:|----------:|---------------------:|
+| `set` builtin     | 14 840.20 |  7 674.41 | -7 166 KB (-48 %)    |
+| Total sampled     | 140.15 MB |  135.00 MB| -5.15 MB (-3.7 %)    |
+
+The remaining 7 MB of `set` is **not** `PDFContext.assign`
+anymore -- `find-heap-callers` on the post profile shows it's the
+upstream `PDFRef.of`'s `pool.set(tag, instance)` on cache miss.
+Even with `fast-refs`'s dense-array short-circuit on the LOOKUP
+side, the first time each unique objectNumber is encountered the
+shim calls through to the original `PDFRef.of`, which constructs
+the `PDFRef` AND populates the upstream `Map<string, PDFRef>`
+pool. That's the next target.
+
+## Skip `PDFRef` `pool.set` on the gen=0 miss path
+
+With `fast-indirect-objects` shipping, the heap profile showed
+one last hot `set` source: the upstream `PDFRef.of`'s own pool
+(`pdf-lib/.../objects/PDFRef.js:34`):
+
+```js
+PDFRef.of = function (objectNumber, generationNumber) {
+    ...
+    var tag = objectNumber + " " + generationNumber + " R";
+    var instance = pool.get(tag);
+    if (!instance) {
+        instance = new PDFRef(ENFORCER, objectNumber, generationNumber);
+        pool.set(tag, instance);                  // ← 7 MB of set on the book
+    }
+    return instance;
+};
+```
+
+`fast-refs` already short-circuited the LOOKUP side with a dense
+array indexed by `objectNumber`. But on a gen=0 cache miss (~9 k
+unique objectNumbers per book), the shim was calling
+`original.call(PDFRef, objectNumber, 0)`, which dutifully built
+the tag string, looked it up in the upstream Map, missed,
+allocated a new `PDFRef`, AND populated the upstream pool --
+redundantly, since the dense array `pool0` is the authoritative
+cache from now on.
+
+Each `pool.set` over the load grew the Map's hash table through
+~12 doubling steps (4 -> 8 -> ... -> 16384), discarding each
+intermediate arena. Total: ~7 MB of `set` self-size in the heap
+profile, plus the matching ~93 ms of `PDFRef.of` CPU self-time
+(the function body that does the set is hot enough that V8
+charges all that growth to `PDFRef.of`'s frame).
+
+### The upgrade
+
+Replace the original-delegation on the gen=0 miss path with
+direct construction:
+
+```js
+PDFRef.of = function fastOf(objectNumber, generationNumber) {
+  if (generationNumber === undefined || generationNumber === 0) {
+    const existing = pool0[objectNumber];
+    if (existing) return existing;
+    const fresh = Object.create(PDFRef.prototype);
+    fresh.objectNumber = objectNumber;
+    fresh.generationNumber = 0;
+    fresh.tag = objectNumber + ' 0 R';
+    pool0[objectNumber] = fresh;
+    return fresh;
+  }
+  return original.call(PDFRef, objectNumber, generationNumber);
+};
+```
+
+Safety: `PDFRef`'s super class (`PDFObject`) has a no-op
+constructor (`pdf-lib/.../PDFObject.js:5`) so skipping
+`_super.call(this)` is fine. The only instance fields the
+prototype methods read are `objectNumber`, `generationNumber`,
+and `tag` (used by `toString` / `sizeInBytes` / `copyBytesInto`);
+direct field init covers them. The `ENFORCER` check exists to
+make `PDFRef.of` the single legitimate factory -- we already are
+that factory, so bypassing it doesn't violate any invariant.
+
+gen!=0 keeps the original delegation (rare on freshly-parsed
+PDFs; its `Map.set` traffic is negligible at gen!=0 volume).
+
+### Measured wins
+
+CPU profile (paired `--cpu-profile-process --cpu-sampling 100`,
+fast-indirect-objects baseline vs + this upgrade):
+
+| Row                  | Pre (ms) | Post (ms) | Note                       |
+|----------------------|---------:|----------:|----------------------------|
+| (garbage collector)  |   176.83 |    166.71 | -10 ms                     |
+| **PDFRef.of**        | **118.24** | **out of top 15** | **drops off (~93 ms saved)** |
+| fastOf @ fast-refs   |        - |     25.19 | new row (was inside `PDFRef.of`) |
+| Total profile        |  1.14 s  |   1.03 s  | -110 ms (-9.6 %)           |
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`):
+
+| Allocator         | Pre (KB)  | Post (KB) | Delta                |
+|-------------------|----------:|----------:|---------------------:|
+| `set` builtin     |  7 674.41 |    504.77 | **-7 170 KB (-93 %)** |
+| fastOf @ fast-refs|  9 367.39 |  7 734.79 | -1 633 KB             |
+| Total sampled     | 135.00 MB | 123.11 MB | -11.89 MB (-8.8 %)    |
+
+The residual 504 KB of `set` is `fastCache.set` in `PDFName`
+interning (~448 KB) plus a sliver of `__awaiter` machinery in
+`PDFDocument`; both are static-size and harmless. There is no
+longer any materially-hot `Map.prototype.set` in the process-phase
+heap profile.
+
+The edit is local to `docs/lib/fast-refs.mjs`; no production
+import change needed since `fast-refs` was already wired up.
+
+## Pool `PDFNumber` instances by value
+
+With every `Map.set` in the load path either eliminated or
+reduced to its irreducible floor (`PDFName` fastCache, ~0.5 MB),
+the next-largest bucket in the heap profile was
+`parseNumberOrRef` at 15 MB -- mostly inlined `new PDFNumber(value)`
+from the parser's number branch:
+
+```js
+function PDFNumber(value) {
+  var _this = _super.call(this) || this;
+  _this.numberValue = value;
+  _this.stringValue = numberToString(value);     // alloc per instance
+  return _this;
+}
+PDFNumber.of = function (value) { return new PDFNumber(value); };
+```
+
+No pool. Every `PDFNumber.of(N)` returns a fresh instance, even
+for the same `N`. PDFs reuse a handful of integer values
+*constantly*: the book has 1 651 page entries (each contributing
+`/MediaBox` dimensions like 612, 792, integer indices, `/Count`,
+`/N` totals), plus content-stream numeric literals, font sizes,
+and bit widths. Hundreds of thousands of `PDFNumber.of` calls
+against maybe a few thousand unique values.
+
+A `PDFNumber` is also conceptually immutable: `numberValue` and
+`stringValue` are written once in the constructor and never
+mutated. Pooling by value is therefore safe.
+
+### Could we just store a raw `number`?
+
+In principle yes. `PDFNumber` exists structurally to satisfy
+pdf-lib's polymorphic dispatch on every dict / array value
+(`value.copyBytesInto(buffer, offset)`, `value.sizeInBytes()`,
+`value.asNumber()`). Replacing it with a primitive would
+require:
+
+- Type-branching in `PDFDict.copyBytesInto` /
+  `PDFArray.copyBytesInto` / `sizeInBytes`: `typeof === 'number'`
+  fast-path that writes the number's string form directly.
+- Updating ~53 consumer sites in pdf-lib's API code (everything
+  that does `lookup(name, PDFNumber).asNumber()` or
+  `value instanceof PDFNumber`) to handle bare numbers.
+- A V8 deopt risk: the serializer's previously-monomorphic
+  `.copyBytesInto` call site becomes polymorphic across two
+  representations.
+
+That's a much bigger surgery for a similar magnitude of win,
+because pooling already collapses every repeated-value
+allocation to a single shared instance. So we shipped the pool
+first; had the post-pool heap profile still shown `PDFNumber` as
+a top allocator, stripping would have been worth the API surgery.
+It didn't.
+
+### The shim
+
+`docs/lib/fast-pdfnumber-pool.mjs` installs the cache. Same
+shape as `fast-refs`: dense array indexed by `value` for
+non-negative integers in `[0, 16384)` (covers every observed
+integer value in the book by a wide margin), Map fallback for
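+floats, negatives, and out-of-range integers. `NaN` falls through
+to the Map, whose `SameValueZero` keying matches `NaN` to itself;
+`-0` passes the dense-array test (`(-0 | 0) === -0`) and shares
+slot 0 with `+0`. No special-casing needed.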
+
+```js
+PDFNumber.of = function fastNumberOf(value) {
+  if (value >= 0 && value < POOL_SIZE && (value | 0) === value) {
+    let pn = intPool[value];
+    if (pn !== undefined) return pn;
+    pn = original.call(PDFNumber, value);
+    intPool[value] = pn;
+    return pn;
+  }
+  let pn = otherPool.get(value);
+  if (pn !== undefined) return pn;
+  pn = original.call(PDFNumber, value);
+  otherPool.set(value, pn);
+  return pn;
+};
+```
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+fast-refs upgrade baseline vs + pool):
+
+| Allocator                | Pre (KB)  | Post (KB)            | Delta                |
+|--------------------------|----------:|---------------------:|---------------------:|
+| **parseNumberOrRef**     | 15 388.73 | **out of top 10**    | **-15+ MB**          |
+| `String` builtin         |  1 202.23 | out of top 10        | -                    |
+| `PDFNumber.of` (pool miss)|        - |               818.92 | new, ~unique count   |
+| Total sampled            | 123.11 MB |            107.21 MB | **-15.9 MB (-13 %)** |
+
+`parseNumberOrRef`'s row collapsed off the top 10. The new
+`PDFNumber.of` row at 0.8 MB is the floor -- one `PDFNumber` per
+unique value across the whole load. The `String` builtin row
+(`stringValue` allocations) also collapsed because they're now
+allocated once per unique value, not once per use site.
+
+CPU profile (same paired methodology): GC self-time effectively
+flat (166.71 ms -> 165.54 ms), total profile duration within
+sample-count noise (1.03 s -> 1.09 s). Pool cost per call is a
+branch + array index, which V8 inlines into the hot
+`parseNumberOrRef` path. CPU is a wash; the win is pure heap.
+
+### A companion analyzer: `find-heap-callees.mjs`
+
+Adding this shim also surfaced the question "what's
+`fastParseDictArray` actually allocating at its 58 MB self-row?".
+`find-heap-callers` answers "who calls X?"; the inverse --
+"what does X allocate?" -- needed a new tool. `find-heap-callees.mjs`
+walks the `.heapprofile` tree and lists a target frame's direct
+children with their (self + subtree) byte totals.
+
+Used here, it cracked open the `fastParseDictArray` row: most of
+the 58 MB was recursive `parseDict` invocations across nesting
+levels, not a single allocator. That's intrinsic to the document
+structure (page-tree dicts contain Kids arrays of Page dicts that
+contain Resources dicts...), not something a shim can shrink.
+The tool stays for future investigations.
+
+## Pre-size `parseDict`'s backing array
+
+After `fast-pdfnumber-pool` shipped, `fastParseDictArray` was
+53 % of the residual heap profile (~58 MB self-size). Three
+components in that frame:
+
+```js
+const arr = [];                                  // (1) array alloc + cap-4 FixedArray
+while (...) {
+  arr.push(key, value);                          // (2) growth via doubling
+}
+return new PDFDict(arr, this.context);           // (3) PDFDict instance
+```
+
+Without per-call counts, the 58 MB could plausibly be 10 k huge
+dicts or 300 k tiny ones. So we instrumented it:
+`perf/instrument-parsedict.mjs` wraps the shim's `parseDict` to
+count invocations and record the size distribution on exit.
+The book's workload:
+
+```
+total calls       : 260 967
+total entries     : 1 170 264
+avg entries/dict  : 4.48
+max entries/dict  : 4 353
+max recursion     : 3
+entries-per-dict histogram:
+     1 :     822
+     2 :  22 551
+     3 :  13 372
+     4 :  73 936    (28 %)
+     5 : 135 438    (52 %)   <-- median
+     6 :     231
+     7 :  12 458
+     8 :   1 644
+     9..31:  ~530
+   32+ :       2
+```
+
+**80 % of dicts have exactly 4 or 5 entries; 99 % have <= 7. Max
+recursion only 3 deep.** That maps cleanly onto V8's array
+growth behavior: a 5-entry dict's `arr.push(key, value)` chain
+grows the backing FixedArray from cap 4 -> 8 -> 16, discarding
+the two intermediate stores as garbage:
+
+| Dict entries | Push slots | Growth path | FixedArray bytes (incl. discards) |
+|-------------:|-----------:|-------------|----------------------------------:|
+|  4 (28 %)    |   8        | 4 -> 8      | 64 + 96 = 160 B                   |
+|  5 (52 %)    |  10        | 4 -> 8 -> 16 | 64 + 96 + 152 = 312 B           |
+|  7 (5 %)     |  14        | 4 -> 8 -> 16 | 312 B                             |
+|  2 (9 %)     |   4        | 4           | 64 B                              |
+
+Weighted average ~220 B of FixedArray throughput per dict.
+Across 261 k dicts: ~57 MB -- matching the observed 58 MB
+self-row almost exactly. **~85 % of the row is growth garbage
+from not pre-sizing.**
+
+### The fix
+
+Allocate the accumulator at the median size up front and use
+direct indexing with a `len` counter; fall back to push only for
+the rare overflow case.
+
+```js
+// Pre-sized permanent backing array (not a scratch buffer --
+// the array is what we hand to PDFDict, just with capacity set
+// to the median dict size up front to skip the growth chain).
+const INITIAL_SLOTS = 10;   // median = 5 entries = 10 push slots
+const arr = new Array(INITIAL_SLOTS);
+let len = 0;
+while (...) {
+  const key = this.parseName();
+  const value = this.parseObject();
+  if (len < INITIAL_SLOTS) {
+    arr[len]     = key;
+    arr[len + 1] = value;
+  } else {
+    arr.length = len;
+    arr.push(key, value);   // rare: 6+ entry dicts grow from 10
+  }
+  len += 2;
+}
+arr.length = len;            // trim hole tail
+```
+
+### Picking `INITIAL_SLOTS`
+
+`INITIAL_SLOTS = 16` was the first try (covers every dict of up
+to 8 entries without growth -- over 99 % of cases per the
+histogram above). It saved only ~5.6 MB instead
+of an estimated ~22 MB. The reason: `new Array(16)` allocates a
+176-byte FixedArray *for every dict*, including the 9 % of
+2-entry dicts that previously needed only 64 bytes. The cap-16
+baseline is itself ~46 MB across 261 k calls.
+
+`INITIAL_SLOTS = 10` is exact-fit for the 52 % dominant 5-entry
+case (no growth, no waste), small waste for 2/3/4-entry dicts
+(4-6 unused slots), and one growth for the 5 % at 7 entries
+plus the ~2 % above that. Best balance for this workload.
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+post-fast-pdfnumber-pool baseline vs + `INITIAL_SLOTS = 10`):
+
+| Allocator                | Pre (KB)   | Post (KB)  | Delta              |
+|--------------------------|-----------:|-----------:|-------------------:|
+| **fastParseDictArray**   |  58 203.30 |  43 817.77 | **-14.4 MB (-25 %)** |
+| `push` builtin           |   2 843.44 |   1 621.62 | -1.2 MB            |
+| Total sampled            | 107.21 MB  |  92.13 MB  | **-15.1 MB (-14 %)** |
+
+Two-step path through `INITIAL_SLOTS`:
+
+| Step                        | Total sampled | fastParseDictArray |
+|-----------------------------|--------------:|-------------------:|
+| No pre-size                 |     107.21 MB |          58.20 MB  |
+| `INITIAL_SLOTS = 16`        |     101.61 MB |          55.03 MB  |
+| `INITIAL_SLOTS = 10`        |  **92.13 MB** |       **43.82 MB** |
+
+### What about a true scratch buffer?
+
+The "escalation" alternative was a single long-lived backing
+array on the parser instance, append-then-slice per dict. That
+would actually be a scratch buffer -- reused across calls,
+sliced off into a fresh `PDFDict` storage per dict. It would
+eliminate the per-call `new Array(10)` allocation. But the slice
+result is still a fresh per-dict allocation, sized exactly --
+which for the median 5-entry case is ~104 B (same as cap-10).
+The only net savings would be on small dicts (1-3 entries)
+where the slice is smaller than 10 slots; that's maybe ~2-3 MB
+across 36 k small dicts. Not worth the recursion-safe
+length-pointer save/restore plumbing.
+
+The edit is local to `docs/lib/fast-dict-array.mjs`; no
+production import change needed since `fast-dict-array` was
+already wired up. The `--instrument-parsedict` flag stays on
+`measure.mjs` for future dict-workload investigations.
+
+## View-based PDFDict (explored, didn't ship)
+
+After fast-dict-array pre-sized its per-dict accumulator to median
+size, the `fastParseDictArray` row was still 43.8 MB on the heap
+profile (48 % of total) -- mostly the irreducible floor of "one
+`new Array(10)` + one PDFDict instance per parsed dict, 261 k
+times". The natural next move: stop allocating per-dict storage at
+all, share one backing array across many dicts via a `(buf, start,
+end)` view.
+
+Prototyped as `fast-dict-view.mjs`. Each PDFDict carried a `(buf,
+start, end)` window into a parser-wide per-depth shared array,
+append-only across all dicts at that depth. The win on heap was
+only ~2.5 MB -- the fatter PDFDict instance (5 fields vs 2) ate
+back most of the buffer-sharing saving. Subsequently superseded
+by the one-buffer approach below, which packs the entire dict
+storage into a single mainBuf and shrinks the PDFDict instance
+back down. The view-based shim doesn't ship; the notes here are
+preserved as the thinking that led to one-buffer.
+
+### Why "not scratch"
+
+The earlier comment about "scratch buffer" was wrong vocabulary.
+A scratch buffer is a temporary workspace -- you write, use, and
+discard. Nothing here qualifies: every parsed entry lives until
+the PDFDocument is dropped. What we actually want is a *shared
+backing array* where each PDFDict claims a contiguous range,
+written once and kept. The buffer is append-only; slots are never
+rewritten.
+
+### The recursion gotcha
+
+A naive single shared buffer breaks under parseDict recursion. If
+outer parseDict appends entries to `buf` while parsing a value
+that recurses into inner parseDict, inner's entries get
+interleaved into outer's range. Outer's view would wrongly
+include inner's entries:
+
+```
+outer parseDict starts at len=0
+  outer parses keyA, valueA       -> buf[0,1], len=2
+  outer parses keyB, value=<<...>> -> calls inner parseDict
+    inner appends 3 entries        -> buf[2..7], len=8
+    inner returns view {start:2, end:8}
+  outer wants to write keyB,valueB at buf[8,9] -> len=10
+  outer parses keyC,valueC         -> buf[10,11], len=12
+outer's range: {start:0, end:12}  ← includes inner's entries!
+```
+
+Fix: **one buffer per parseDict-recursion-depth**, not one shared
+globally. Instrumentation
+([perf/instrument-parsedict.mjs](../instrument-parsedict.mjs))
+showed max parseDict depth = 3 on the book, so 3-4 buffers per
+parser. Each buffer is append-only across all dicts at that depth.
+Inner recursion writes to a different buffer than outer, so
+outer's range stays contiguous.
+
+### Copy-on-write for mutations
+
+Shared buffers are correct as long as nobody mutates the entries.
+But `pdfDoc.catalog.set(PDFName.of('Outlines'), outlineRef)` does
+happen in our pipeline (during setOutline). The shim added a COW
+hook to `PDFDict.prototype.set` and `.delete`: first mutation
+copies the (start..end) range into a private array, swaps the
+view to point at that copy with `_dictOwned = true`. Subsequent
+mutations on that dict operate in place. Other dicts sharing the
+original buffer are unaffected.
+
+### Pre-sizing the per-depth buffers
+
+Without pre-sizing the per-depth buffers, V8 doubles their
+backing FixedArray from cap 0 up to (depth 0 case) ~2.1M slots --
+~20 doublings, with each old arena becoming garbage. That growth
+garbage alone was 6.5 MB of the regression observed when first
+prototyping.
+
+Instrumented to measure the final per-depth lengths on a book
+parse:
+
+```
+=== fast-dict-view: depth stats ===
+parser instances seen: 1
+  depth 0: total 2 155 544 slots, max-per-parser 2 155 544 slots
+  depth 1: total   158 260 slots, max-per-parser   158 260 slots
+  depth 2: total    26 724 slots, max-per-parser    26 724 slots
+```
+
+Hardcoded the caps + 10 slack in the shim's `DEPTH_BUF_CAPS`,
+sized to skip all growth on the book. For other workloads the
+buffers grow naturally from these starting sizes;
+oversizing-by-2x doesn't hurt much because there's only one
+buffer per depth per parser.
+
+### Bug-hunt: the depth-reset gotcha
+
+The first version of the shim used `if (!this._dictDepth)` to
+lazy-init the per-parser buffer stack. `!this._dictDepth` is true
+when `_dictDepth = 0` -- which is exactly the state at the *end*
+of every top-level parseDict call (the depth counter was just
+decremented back to zero). The buffers were getting reset between
+every top-level dict; each one was effectively allocating fresh.
+
+Fix: `if (this._dictBufs === undefined)` -- explicit
+undefined-on-construction check. Easy to spot in retrospect, less
+easy to spot when looking at a regression that doesn't make
+sense.
+
+### Why the win is "only" 2.5 MB
+
+Even with perfect pre-sized buffers and the bug fix, fast-dict-view
+beats fast-dict-array by only ~2.5 MB on heap. The expected
+saving was bigger -- one shared buffer should beat 261 k separate
+ones by a lot.
+
+The reason: the PDFDict *instance* in fast-dict-view is itself
+larger. Where fast-dict-array stores `{dict, context}` (2 named
+slots, ~32 B per instance with V8's inline-properties packing),
+fast-dict-view stores `{_dictBuf, _dictStart, _dictEnd, _dictOwned,
+context}` (5 named slots, ~96 B per instance). Across 261 k
+dicts that's ~16 MB of extra per-instance storage that offsets
+most of the buffer-sharing win:
+
+| Per-dict allocation | fast-dict-array (INITIAL_SLOTS=10) | fast-dict-view (pre-sized) |
+|---------------------|-----------------------------------:|----------------------------:|
+| Backing storage     | 104 B per-dict `new Array(10)`     | ~16 B share of shared buf   |
+| PDFDict instance    | ~32 B (inlined constructor)        | ~96 B (Object.create + 5 fields) |
+| **Total / dict**    |                          **~136 B**|                  **~112 B** |
+
+The buffer sharing saves ~88 B per dict on storage, but the
+fatter PDFDict instance eats ~64 B back. Net ~24 B per dict =
+~6 MB structural win, of which ~2.5 MB shows in the heap profile
+after V8 internal overhead variance.
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+fast-dict-array baseline vs + fast-dict-view):
+
+| Allocator                          | Pre (KB)  | Post (KB) | Delta          |
+|------------------------------------|----------:|----------:|---------------:|
+| `fastParseDictArray` / `*View`     |  43 817.77 |  40 955.37 | -2.86 MB       |
+| Total sampled                      |  92.13 MB  |  89.68 MB  | **-2.45 MB**   |
+
+Modest. The takeaway is structural: a view-based shape is the
+right direction, but the PDFDict instance shape itself is now
+the dominant per-dict cost -- so the next prototype needs to
+shrink the instance too. That's the one-buffer + packed-pointer
+work in the following sections.
+
+## Single-double PDFDict (explored, didn't ship)
+
+fast-dict-view's win was capped by the PDFDict instance footprint:
+5 named slots (`_dictBuf`, `_dictStart`, `_dictEnd`, `_dictOwned`,
+`context`) at ~96 B per instance. Across 261 k dicts that's ~25 MB
+of per-dict object header.
+
+The instance shape is what was costing us. Most of those fields are
+small: `start` fits in 22 bits, `length` in 14 bits, `bufIdx` in
+~15 bits (counting setOutline's owned dicts), `owned` is 1 bit. The
+fields that *can't* obviously be made small are the `buf` and
+`context` *references* -- but `buf` already gets reference-by-index
+in fast-dict-view's design (via `_buffers[bufIdx]`), and `context`
+is a *singleton* in our pipeline.
+
+Prototyped as `fast-dict-double.mjs`. The idea: pack the whole
+instance state into one 53-bit Number stored as PDFDict's single
+`d` field, and treat the PDFContext as a module-level singleton.
+Heap dropped 90 MB → 84 MB (-6 MB / -7 %); GC self-time
+166.7 ms → 128.8 ms (-23 %). Promising, but the next move --
+also packing the entries into one shared buffer -- gives a
+cleaner overall shape and made fast-dict-double an opt-in
+stepping stone rather than a shipping target. The shim doesn't
+ship; the notes here document the design.
+
+### One PDFContext per process
+
+PDFContexts are created by `PDFParser.forBytesWithOptions` inside
+`PDFDocument.load`. In our pipeline `PDFDocument.load` is called
+exactly once per build (in `docs/render-book.mjs`), so exactly one
+PDFContext exists during the process phase. The shim stashed that
+one PDFContext in a module-level `_singletonContext` variable; the
+`PDFDict.prototype.context` getter just returned it. Any second
+distinct context would throw -- intentional bailout for workloads
+this shim isn't a fit for (e.g. merging two PDFs in one process).
+
+### 53-bit packed layout
+
+That leaves everything else fitting in one Number:
+
+```
+bits  0-21: start   (22 bits, max 4 M slots; depth-0 hits 2.16 M)
+bits 22-35: length  (14 bits, max 16 384 slots; max observed 8 706)
+bits 36-50: bufIdx  (15 bits, max 32 768 buffers; book uses ~1 800
+                    once setOutline creates per-outline-node
+                    owned dicts via the factory)
+bit  51   : owned flag
+bit  52   : spare
+```
+
+Stored as a single `d` field on each PDFDict instance. Reads use a
+mix of bitwise (for fields entirely below bit 32) and arithmetic
+(for fields straddling or above 32, since JS bitwise ops cast to
+int32):
+
+```js
+function _start(d)  { return d & MASK_22; }                  // bitwise
+function _length(d) { return Math.floor(d / POW_22) & MASK_14; }
+function _bufIdx(d) { return Math.floor(d / POW_36) & MASK_15; }
+function _owned(d)  { return Math.floor(d / POW_51) & 1; }
+```
+
+Writes:
+
+```js
+function pack(start, length, bufIdx, owned) {
+  if (start  >= MAX_START)  throw new Error('start overflow');
+  if (length >= MAX_LENGTH) throw new Error('length overflow');
+  if (bufIdx >= MAX_BUFIDX) throw new Error('bufIdx overflow');
+  return start + length * POW_22 + bufIdx * POW_36 + (owned ? POW_51 : 0);
+}
+```
+
+Overflow guards: if any field exceeds its budget, the shim throws
+with a clear message. The budgets are sized 2-5x the book's
+observed workload, so this is a guardrail for surprise inputs
+rather than a hot path.
+
+### V8 representation
+
+A property whose values consistently fall outside Smi range (which
+`d` does, since `bufIdx * 2**36` immediately exceeds 2^31) gets
+stored either inline as DoubleField (8 B inline double) or via
+TaggedField (8 B pointer + ~16-24 B HeapNumber). Empirically the
+heap drop was consistent with most instances using DoubleField:
+the `fastParseDictView` row's combined self+`_makeFromView` self
+dropped from 40.96 MB to 35.34 MB (an extra ~5 MB beyond what
+plain buffer-sharing achieved).
+
+### Subclasses
+
+PDFCatalog and PDFPageTree add no instance fields beyond `d`.
+PDFPageLeaf still needs `normalized` and `autoNormalizeCTM` as
+separate slots; that's ~1.6 k page leaves out of 261 k total dicts
+on the book, a small fraction.
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+fast-dict-view baseline vs + fast-dict-double):
+
+| Allocator                          | Pre (KB)  | Post (KB) | Delta             |
+|------------------------------------|----------:|----------:|------------------:|
+| `fastParseDictView` / `*Double`    |  40 955.37 |  18 913.63 | -22.0 MB         |
+| `_makeFromView` (separate child row)|    773.09 |  16 429.68 | +15.7 MB         |
+| Combined (fastParse* + _makeFromView)| 41 728.46 |  35 343.31 | **-6.4 MB**      |
+| Total sampled                      |  89.68 MB |  83.68 MB | **-6.0 MB (-7 %)** |
+
+(`_makeFromView` shows up as a bigger separate row because V8
+de-inlined it slightly differently for fast-dict-double, but the
+combined "PDFDict construction overhead" dropped ~6 MB.)
+
+CPU profile (paired `--cpu-profile-process --cpu-sampling 100`):
+
+| Row                          | Pre (ms) | Post (ms) | Delta                |
+|------------------------------|---------:|----------:|---------------------:|
+| (garbage collector)          |   166.71 |    128.81 | **-37.9 ms (-23 %)** |
+| `fastParseDictView` / `*Double` |    28.95 |     44.36 | +15.4 ms (incl COW + pack/unpack) |
+| Total profile duration       |   1.03 s |    0.97 s | -60 ms (-6 %)        |
+
+The GC self-time drop is the headline: less heap allocation
+directly translates to less GC work. The fastParseDict* row went
+up a bit (more arithmetic in unpack), but the saving on GC and
+elsewhere comfortably outweighs it.
+
+### Cumulative arc
+
+Starting from the original Map-backed PDFDict:
+
+| State                            | Total sampled | Change vs prior |
+|----------------------------------|--------------:|----------------:|
+| Map-backed (pre-fast-dict-array) |   152 MB      | -               |
+| fast-dict-array (INITIAL_SLOTS=10)|    92 MB     | -60 MB          |
+| fast-dict-view (shared buffers)  |    90 MB      | -2 MB           |
+| **fast-dict-double**             |    **84 MB**  | **-6 MB**       |
+
+**-45 % cumulative reduction in process-phase heap traffic.**
+
+### Caveats
+
+- **Single context assumption.** If you load a second PDFDocument
+  in the same process the shim throws. For our build pipeline this
+  is fine; for general pdf-lib use a multi-context variant would
+  need an array + small ctxIdx field.
+- **Bit budgets.** Sized for the book and similar PDFs. A PDF
+  whose per-depth buffer grows past 4 M slots (2 M entries; a very
+  large book or pathological generator) would trip the start
+  budget; a PDF with a single dict larger than 8 192 entries
+  (16 384 slots) would trip length; setOutline producing more than
+  32 k owned dicts would trip bufIdx. All three are deliberate
+  guards rather than expected failures.
+- **Arithmetic in hot path.** Each read of a high-bit field is one
+  `Math.floor(d / 2**n) & mask`. V8 optimizes division by
+  powers-of-2 well, but it's not free. The 23 % GC drop is the
+  empirical confirmation that the heap savings outweigh the
+  unpack cost.
+
+The next prototype (one-buffer PDFDict) keeps the
+"packed-into-Number" idea but moves the entries themselves into
+a single per-parser mainBuf, which folds the bufIdx field away
+and lets a tighter bit layout track the (mainBuf-relative) start
++ length directly. That's what ends up shipping.
+
+## One-buffer PDFDict
+
+After the fast-dict-double prototype, the heap picture showed
+~1 780 backing arrays in flight: 3 per-depth parser buffers,
+~1 773 owned buffers created by setOutline's factory calls (one
+per outline node), plus a few during save. Each owned buffer
+has Array-header overhead; each parser-buffer needed its own
+slot in the `_buffers` registry. And `bufIdx` in the packed
+value had to be wide enough to address all of them -- 15 bits.
+
+Using **one buffer** for every committed PDFDict entry across the
+whole document would:
+
+- drop ~1 780 Array headers to 1
+- drop `bufIdx` from the packed value entirely (always 0)
+- keep all dict data in contiguous memory (better cache behavior)
+
+This is what ships as
+[fast-dict-onebuf.mjs](../../docs/lib/fast-dict-onebuf.mjs). It
+takes the place of fast-dict-array on the production import in
+`render-book.mjs`. Earlier dict-shape shims (fast-dict-array,
+fast-dict-iter, fast-parse-dict) stay in the tree as A/B
+baselines; the harness mutex rejects combining them.
+
+### The recursion gotcha (again)
+
+A single shared buffer breaks naive parseDict recursion exactly
+like it did when the view-based prototype first hit the same
+question: inner recursion writes into the middle of outer's
+entries, breaking outer's contiguous range.
+
+The fix is a **two-area split**:
+
+- `main` -- one long-lived buffer for committed entries. Append-only.
+- `temp` -- small per-parser working area for active parseDict
+  frames. Reused across all parseDict calls on the parser.
+
+```
+parseDict invocation (at any recursion depth):
+  frameStart = temp.length
+  while (parsing) {
+    key   = parseName()
+    value = parseObject()      // may recurse; temp grows then pops
+    temp.push(key, value)       // ON TOP of anything recursion left
+  }
+  // Commit this frame to main in one contiguous append
+  start = main.length
+  for entry in temp[frameStart..temp.length]:
+    main.push(entry)
+  // Pop our frame off temp
+  temp.length = frameStart
+  return PDFDict with view (start, length)
+```
+
+Outer's entries stay parked in `temp[frameStart..]` while inner
+recurses. Inner appends ON TOP of outer, commits its frame to
+`main` in one append, and pops its frame off `temp`. Outer's
+frame is intact at the top of `temp` again; outer continues
+pushing. When outer commits, its entries are contiguous in `temp`
+and commit contiguously to `main`. Outer's and inner's ranges in
+`main` are at distinct, non-overlapping offsets.
+
+`temp` is small -- bounded by max recursion depth × max
+single-dict size. Typically that is a couple dozen slots; it
+peaks only while the book's largest dict (8 706 slots) is being
+parsed.
+
+### Mutations
+
+The shared (parser-created) range is read-only after parse. The
+ownership flag in `d` distinguishes shared from owned dicts:
+
+- **`set` with existing key**: in-place replace at `main[start +
+  i + 1]`. Safe for both shared and owned; no shifts.
+- **`set` with new key, dict at main's high-water mark**: just
+  `main.push(key, value)` and extend the range by 2. Common for
+  owned dicts that have just been created and are being filled
+  with `.set` calls (the outline construction pattern).
+- **`set` with new key, dict not at high-water mark**: COW. Copy
+  the range to `main`'s tail, append new pair, update encoded
+  value. Happens when other dicts were created between this dict's
+  creation and the `.set` call.
+- **`delete`**: always COW (shifting slots in `main` would corrupt
+  other dicts that point into the affected region).
+
+For setOutline's pattern -- create outline dict, recurse to build
+children, then call `.set(Prev/Next/First/Last/Count)` on it --
+the first `.set` after the recursion COWs the dict to the tail.
+Subsequent `.set`s on the same dict extend in place. Net: ~one
+COW per outline dict, ~5 entry copies each = ~9 k pair copies
+total. Negligible.
+
+### Bit layout
+
+With `bufIdx` gone, the packed value shrinks:
+
+```
+bits  0-23: start  (24 bits, max 16 M slots in main)
+bits 24-37: length (14 bits, max 16 384 slots; max observed 8 706)
+bit  38   : owned flag
+bits 39-52: spare (14 bits)
+```
+
+39 bits used. Still above Smi range (so V8 stores `d` as a
+DoubleField or HeapNumber), but with plenty of headroom and a
+much cleaner layout.
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+fast-dict-double baseline vs + fast-dict-onebuf):
+
+| Allocator                          | Pre (KB)  | Post (KB) | Delta             |
+|------------------------------------|----------:|----------:|------------------:|
+| `fastParseDictDouble` / `*OneBuf`  |  18 913.63 |       — (out of top 10) | **-18.9 MB**   |
+| `_makeFromView` / `_makeFromRange` |  16 429.68 |  16 613.10 | flat              |
+| PDFObjectParser.parseArray         |  19 502.52 |  19 512.08 | flat              |
+| Total sampled                      |  83.68 MB |  65.55 MB | **-18.1 MB (-22 %)** |
+
+The dominant change: `fastParseDictDouble` had 18.9 MB of self-
+attributed allocations (the 3 parser per-depth buffers' growth +
+the per-dict array creation in factory paths). With fast-dict-
+onebuf, those are gone entirely -- everything appends to `main`,
+which is allocated once.
+
+CPU profile (same paired methodology, with the wall-clock-is-noisy
+caveat):
+
+| Row                              | Pre (ms) | Post (ms) | Delta              |
+|----------------------------------|---------:|----------:|-------------------:|
+| (garbage collector)              |   128.81 |    151.05 | +22.2 ms           |
+| `fastParseDictDouble` / `*OneBuf` |    44.36 |     53.44 | +9.1 ms            |
+| Total profile duration           |   0.97 s |    1.05 s | +80 ms (~8 %, within machine noise) |
+
+GC self-time bumped up a bit. The `main` buffer is one giant
+~19 MB live object now; V8's mark phase scans it every cycle even
+though we're allocating less new garbage. Heap throughput went
+down 22 %, but live-heap mark cost went up modestly. On this
+machine wall-clock isn't a reliable signal anyway; the heap
+reduction is the headline.
+
+### Cumulative arc
+
+| State                            | Total sampled | Change vs prior |
+|----------------------------------|--------------:|----------------:|
+| Map-backed (pre-fast-dict-array) |   152 MB      | -               |
+| fast-dict-array                  |    92 MB      | -60 MB          |
+| fast-dict-view  (explored)       |    90 MB      | -2 MB           |
+| fast-dict-double (explored)      |    84 MB      | -6 MB           |
+| **fast-dict-onebuf**             |    **66 MB**  | **-18 MB**      |
+
+**-57 % cumulative reduction since the start of this PDFDict
+storage-shape work.** Staging's chain skips the two intermediate
+shims and goes from fast-dict-array straight to fast-dict-onebuf;
+the heap drop on that direct hop is 92 → 66 MB (-28 %).
+
+### Caveats
+
+- **Single context.** Same singleton-PDFContext assumption that
+  fast-dict-double introduced: throws if a second PDFContext is
+  constructed in the process. Fine for our build pipeline (one
+  `PDFDocument.load` per build); a general-purpose variant would
+  need an array + small ctxIdx field.
+- **Single 24-bit start budget.** If `main` exceeds 16 M slots
+  (8 M entries) the next pack() throws. The book's `main` peaks
+  at ~2.4 M slots; 6x headroom.
+- **COW on delete.** Always. Cheap for small dicts; could be slow
+  for huge dicts with frequent deletes. Not a pattern we see.
+- **Live `main` is bigger than the prior approach's transient
+  allocations.** GC mark phase pays for that. The tradeoff -- less
+  *allocation* (heap throughput) but slightly more *live* (mark
+  cost) -- shows in the modestly higher GC time. Profile both
+  signals when evaluating.
+
+### Shipped
+
+`docs/render-book.mjs` imports
+[`./lib/fast-dict-onebuf.mjs`](../../docs/lib/fast-dict-onebuf.mjs)
+in place of the prior `./lib/fast-dict-array.mjs`. fast-dict-array
+stays in the tree as an A/B baseline; the `--fast-dict-onebuf`
+mutex in `measure.mjs` rejects combining either with the other
+dict-shape shims.
+
+## Two-pass measure-allocate-work: Phase 0 viability gate
+
+After fast-dict-onebuf, GC self-time settled at ~150 ms / 15 % of
+the process phase. V8-flag knobs (`--max-semi-space-size`,
+`--max-old-space-size`, `--no-incremental-marking`,
+`--gc-interval=-1`) didn't move it -- mark cost is dominated by
+walking the live set, not by allocation rate. The remaining
+attack surface is **shrink the live set V8 has to mark**, ideally
+by representing dict slots as Numbers (a Float64Array mainBuf)
+rather than Object references.
+
+That option needs an encoding scheme for every value type that
+can live in a dict slot. Names, refs, numbers, and nested dicts
+are already pooled or naturally Number-encodable. PDFArray,
+PDFString, and PDFHexString are not pooled today, so they'd need
+a side `Object[]` fallback -- which V8 still marks. The fallback
+would shrink mark cost in proportion to how many slots are
+pooled, but not eliminate it.
+
+The cleaner version sidesteps the encoding-headroom question
+entirely by **measuring before allocating**:
+
+1. **Measure pass** -- walk the bytes as a state machine, no
+   PDFObject instantiation. Produce only counts and small
+   interning tables (Map<name, id>, dense ref array).
+2. **Allocate pass** -- every pool sized exactly: mainBuf as
+   `Float64Array(exact_slot_count)`, name/ref/number pools as
+   exact-sized arrays, string buffer as one exact-sized
+   `Uint8Array`. No growth, no slack.
+3. **Work pass** -- re-parse, this time encoding each value as a
+   pool-index Number into mainBuf. Every pool's size is known so
+   the encoding scheme is trivial (3 bits of type tag + N bits
+   of pool index, all fitting comfortably in 53 bits). All of
+   mainBuf is Float64; V8 marks nothing in it.
+
+The catch: a second parse is more CPU. Today's load is ~1.2 s on
+the 39 MB Chrome input; if measure-pass were 600 ms we'd regress
+on CPU even if GC dropped to zero. Phase 0 is a viability gate:
+implement the no-allocate measure pass, time it, decide whether
+the architecture is worth the engineering surface.
+
+### The walker
+
+[`perf/phase0-measure.mjs`](../phase0-measure.mjs) is a
+no-allocate byte walker that recognises the PDF grammar:
+indirect-object headers, dicts (`<< ... >>`), arrays
+(`[ ... ]`), names (`/foo`), strings (`(...)`), hex strings
+(`<...>`), numbers (integer and real, with or without a leading
+integer part), refs (`X Y R`), streams (detected as `dict`
+followed by `stream` keyword), and ObjStms (detected via
+`/Type /ObjStm` and inflated to recurse).
+
+Allocation discipline:
+
+- No string concat anywhere. Names, numbers, and strings are
+  skipped by advancing the byte cursor without keeping bytes.
+- Counters and per-frame dict captures live on typed-array
+  stacks (`Int32Array`, `Uint8Array`), depth-indexed to a max
+  of 64 (observed max recursion is 4).
+- ObjStm offset arrays are reusable `Int32Array(512)` instances,
+  grown on demand. The inflate destination is a fresh Buffer
+  per ObjStm (Chrome's raw output has zero ObjStms anyway; book.pdf
+  has 453 of them after pdf-lib's save bundles them).
+- Per-dict capture stack stores `/Length`, `/Type` (matched
+  against `ObjStm`), `/N`, `/First` -- enough to detect streams
+  and seek through them without a fallback scan in the common case.
+  Key disambiguation is inline byte comparison against the four
+  known stream-related names; everything else falls through to
+  unconditional name-body skip.
+
+### Two corners worth remembering
+
+- **PDF reals can omit the integer part.** `.251` is a valid
+  number; the first cut required `>= 1` integer digit and threw
+  on `<</CA .251 ...>>` (Chrome emits `/CA` and `/ca` alpha
+  values this way). Fix: accept `sign? digits? ("." digits?)?`
+  with the constraint that at least one digit (int OR frac)
+  appears. pdf-lib's `parseRawNumber` handles this natively;
+  custom byte walkers have to remember.
+- **fast-dict-onebuf is singleton-context.** A second
+  `PDFDocument.load` in the same process throws. The Phase 0
+  comparison runs measure-pass N times (independent) but the
+  pdf-lib load only once.
+
+### Measured cost
+
+Input: `perf/raw.pdf` (39.3 MB, Chrome's raw output for the book,
+saved via the new `--dump-raw-pdf` flag below).
+
+| Pass                           | Time              | Notes                                |
+|--------------------------------|------------------:|--------------------------------------|
+| Measure pass (min of 5)        |          **135 ms** | runs were 135 / 143 / 147 / 152 / 156 |
+| `PDFDocument.load` (1 run)     |         **1238 ms** | production shim set imported         |
+| **ratio measure / load**       |        **0.109**  | ~9x cheaper                          |
+
+Throughput cross-check: book.pdf is 15.3 MB but the measure pass
+inflates 23.2 MB of ObjStm content, so effective bytes walked is
+~38.5 MB. raw.pdf walks 39.3 MB. Both clock ~290 MB/sec; the
+work-per-byte is consistent across two very different physical
+layouts.
+
+### What the counts unlock
+
+Per-run summary (raw.pdf, last run):
+
+```
+  indirect objects:  226 417
+  dicts:             260 966   slots: 2 340 522   max single: 8 706
+  arrays:             81 191   slots:   495 639   max single: 25 308
+  refs (appearances):       749 779
+  names (appearances):    1 679 151
+  numbers (appearances):    284 104
+  strings (literal/hex):    7 375 / 0
+  streams:                    2 061   ~11 MB content
+  objstms:                        0
+  max recursion depth:            4
+```
+
+Direct consequences for Phase 1+:
+
+- `mainBuf` would be `Float64Array(2 340 522 + slack)` -- a hard
+  upper bound, no growth ever.
+- Array-side mainBuf would be `Float64Array(495 639 + slack)`.
+- Recursion stack peaks at 4; no need to overallocate the temp.
+- Single largest dict is 8 706 slots, which fits the 14-bit
+  length field (max 16 383) fast-dict-onebuf already uses.
+  Single largest array is 25 308 slots, which does not -- an
+  array-side view needs at least a 15-bit length field
+  (max 32 767).
+
+Three caveats on the counts:
+
+- **Appearance counts, not unique.** 1.68 M name appearances
+  resolve to a few thousand unique strings after interning. The
+  measure pass needs an interning Map<string, id> for names
+  (and similar for refs) to produce the *unique* pool sizes
+  needed for exact allocation. That's a Phase 1 addition --
+  cheap to add, will slightly raise measure-pass cost.
+- **Counts are physical-layout-independent.** raw.pdf has
+  226 k flat indirect objects and zero ObjStms; book.pdf has
+  2.5 k indirect objects of which 453 are ObjStms bundling 226 k
+  dicts. The *dict* count is identical (~261 k) either way.
+  This is the right invariant: pool sizing tracks the logical
+  document, not Chrome's vs pdf-lib's packing decision.
+- **Stream-length capture is fast-path-only.** When `/Length`
+  is a direct integer (the common case) we seek by it. When it's
+  a ref (`/Length 5 0 R`) we fall back to scanning for
+  `endstream`. We don't currently count fallbacks; would need to
+  add a counter if it ever looks like a non-trivial fraction.
+
+### Decision
+
+Architecture cleared. Measure-pass at ~11 % of load leaves
+plenty of headroom: even if the work pass came out at 80 % of
+current load (~990 ms) we'd land at 135 + 990 = 1 125 ms vs the
+current 1 238 ms -- net win on CPU before any GC reduction. The
+Float64Array mainBuf in the work pass should compound on top of
+that.
+
+### Wiring
+
+- **[`perf/measure.mjs`](../measure.mjs)** gains a `--dump-raw-pdf
+  <path>` flag. When set, the harness writes the raw Chrome
+  output (the input to pdf-lib's load) to the given path right
+  after `page.pdf()` returns. Used once to capture the canonical
+  input; not part of any routine run.
+- **`perf/raw.pdf`** (gitignored) is the canonical 39.3 MB
+  Chrome-output PDF, captured with the production shim set and
+  the new flag. The reference input for measure / heap-profile
+  investigations going forward.
+- **[`perf/phase0-measure.mjs`](../phase0-measure.mjs)** is the
+  prototype walker. Takes a PDF path and `--runs N`, runs the
+  measure pass N times, then runs `PDFDocument.load` once
+  (singleton-context), prints counts and the measure / load
+  ratio. Defaults to the most recent `perf/results/*/book.pdf`
+  if no path is given.
+
+Run it via:
+
+```
+node perf/phase0-measure.mjs perf/raw.pdf --runs 5
+```
+
+The prototype is measurement-only -- it doesn't ship in any
+production path. Phase 1 (next section) wires the measure-pass
+into production by using the dict-slot count to pre-size
+fast-dict-onebuf's mainBuf in place.
+
+## Phase 1: pre-size mainBuf via measure-pass
+
+The narrow first step of the two-pass architecture. Productionises
+Phase 0's walker, exposes a `setExpectedDictSlots()` hook on
+fast-dict-onebuf, and wires the two together. Replaces
+`new Array(MAIN_INITIAL_CAP = 2_400_000)` with
+`new Array(measuredDictSlots)` -- exact, no slack, no V8 growth.
+
+This is plumbing, not a perf win. The mainBuf savings are
+trivial (~60 K slots of slack on a 2.34 M-slot backing store)
+and the measure pass itself costs ~60 ms inline. Net cost on
+the book is ~40 ms (the measure-pass time minus run-to-run
+noise on load). What Phase 1 buys is **landing the two-pass
+pipeline byte-identical** so a future Phase 2 (Float64Array
+mainBuf) can convert the storage type without re-doing the
+plumbing.
+
+### The shim
+
+- [`docs/lib/measure-pass.mjs`](../../docs/lib/measure-pass.mjs)
+  -- a direct port of the Phase 0 `Measurer` class as a
+  production library. Exports the class and a
+  `measure(bytes) -> counts` convenience wrapper. No
+  dependencies on any `fast-*` shim or on pdf-lib itself; it's
+  a stand-alone byte walker.
+- [`docs/lib/fast-dict-onebuf.mjs`](../../docs/lib/fast-dict-onebuf.mjs)
+  -- gains `setExpectedDictSlots(slots, slack = 1.0)`. Resizes
+  the module-level `main` in place via
+  `main.length = ceil(slots * slack)`. Throws if called after
+  `mainLen > 0` (i.e. after any dict has been committed). Used
+  by the measure-pass wiring; harmless to ignore.
+- [`perf/measure.mjs`](../measure.mjs) `--measure-pass` --
+  runs the walker on rawPdf, calls
+  `setExpectedDictSlots(counts.dictSlots)`, then proceeds to
+  `PDFDocument.load`. Mutex-checked against `--incremental`,
+  `--render-only`, and the (required) `--fast-dict-onebuf`.
+
+### A V8 IC-invalidation gotcha (worth the diversion)
+
+First implementation reassigned the module binding:
+
+```js
+let main = new Array(MAIN_INITIAL_CAP);  // module load
+// ...
+export function setExpectedDictSlots(slots) {
+  main = new Array(slots);                // setter
+}
+```
+
+JS closures see the current binding value -- the reassignment
+*works correctly* in the language sense, and structural validation
+passes. But the heap profile showed `_appendEntries` jumping from
+below-threshold (~430 KB) to **27 MB / 29 %** of total samples,
+with sampled heap going **65 → 92 MB (+27 MB)**.
+
+Hypothesis trail:
+- First guess: HOLEY_SMI_ELEMENTS → HOLEY_ELEMENTS transition on
+  first Object-pointer write, reallocating the ~18 MB backing
+  store. Pre-filling with `arr.fill(null)` to force the transition
+  at allocation time -- *no change*.
+- Second guess: V8's inline caches in `_appendEntries`,
+  `PDFDict.prototype.get`, etc. specialised for the original
+  `main` object (its hidden class, element kind, address).
+  Rebinding `main` to a fresh Array makes the IC slots stale;
+  every call deopts, recompiles, and accumulates allocation
+  overhead attributed to the running frame.
+
+Fix: keep the same Array identity, just resize.
+
+```js
+const main = new Array(MAIN_INITIAL_CAP);  // module load, back to const
+export function setExpectedDictSlots(slots) {
+  main.length = slots;                     // in-place resize
+}
+```
+
+That collapses the regression to noise (+0.14 MB heap, ~0 ms
+CPU). Lesson: **never rebind a module-level value that hot
+closures specialise against, even if the language semantics
+allow it.** Mutate in place.
+
+### Validation: byte-identical output
+
+Two full-pipeline runs through the production shim set, one
+with `--measure-pass` and one without. Both produce a 1 651-page,
+1 773-outline-node, "twinBASIC Documentation"-titled PDF; bytes
+differ by 31 due to Chrome's per-run rawPdf timestamps, which
+propagate through `pdfDoc.save`. Structural identity confirmed.
+
+| Field                | baseline           | with measure-pass  |
+|----------------------|--------------------|--------------------|
+| pages                | 1 651              | 1 651              |
+| outline nodes        | 1 773              | 1 773              |
+| title                | "twinBASIC Documentation" | "twinBASIC Documentation" |
+| bytes                | 16 077 319         | 16 077 288         |
+
+### Measured cost (after the in-place-resize fix)
+
+Paired runs, production shim set, on the book (39 MB rawPdf):
+
+| Phase             | Without measure-pass | With measure-pass | Delta |
+|-------------------|---------------------:|------------------:|------:|
+| measure-pass      | -                    | 60 ms             | +60   |
+| load              | 520 ms               | 500 ms            | -20   |
+| save              | 420 ms               | 420 ms            |   0   |
+| **process total** | **950 ms**           | **990 ms**        | **+40** |
+
+The 60 ms inline-measure number is faster than the 135 ms
+standalone Phase 0 number, almost certainly because rawPdf is
+still hot in CPU caches from `page.pdf()`. Standalone phase0-
+measure.mjs reads it cold from disk into a Buffer first.
+
+The -20 ms on load is within run-to-run noise on this machine.
+The honest summary: Phase 1 adds the cost of the measure pass
+itself (~60 ms) and not much else.
+
+### Measured heap
+
+Paired heap-profile runs (`--heap-profile-process --heap-sampling
+512`), top frames:
+
+| Frame                                | Baseline (KB) | With measure (KB) | Delta |
+|--------------------------------------|--------------:|------------------:|------:|
+| `PDFObjectParser.parseArray`         |     19 583.67 |         19 435.74 | flat  |
+| `_makeFromRange`                     |     16 510.94 |         16 657.94 | flat  |
+| `parseIndirectObjectHeader`          |     13 510.65 |         13 558.62 | flat  |
+| `fastOf`                             |      7 695.92 |          7 817.85 | flat  |
+| `parseIndirectObjectSync`            |      2 101.19 |          2 102.32 | flat  |
+| `_appendEntries` (post-fix)          |          ~430 |              ~430 | flat  |
+| **total sampled**                    |  **65.27 MB** |      **65.41 MB** | **+0.14 MB** |
+
+Flat as expected. Phase 1 doesn't change what gets allocated --
+only the initial capacity of the backing Array, which is a
+one-time module-load-time cost that the process-phase profile
+doesn't see.
+
+### Caveats
+
+- **Requires --fast-dict-onebuf.** The only shim that consumes
+  `setExpectedDictSlots` so far. The mutex check enforces this.
+- **Singleton context inherited.** Phase 1 doesn't loosen
+  fast-dict-onebuf's "one PDFContext per process" constraint --
+  same throw-on-second-load behaviour.
+- **Pre-sizing assumes the measure and load see the same bytes.**
+  Always true for our pipeline (rawPdf is computed once, both
+  measure and load read it). Would break if the bytes mutated
+  between measure and load -- not a pattern we have.
+- **Counts are appearances, not unique.** Phase 1 only needs
+  dict-slot count, which is an appearance count (every slot is
+  one). Any later phase 2+ pool sizing would need unique counts
+  and would add interning to the walker.
+
+### Where this lands
+
+`--measure-pass` ships behind a harness flag at first, then gets
+wired into [`docs/render-book.mjs`](../../docs/render-book.mjs)'s
+production import chain in a subsequent commit (the "enable
+Phase 1 measure-pass in production" change). The decision to
+ship it was bounded: it's the smallest of the four Phases we
+evaluated and the only one whose tradeoff is acceptable for
+production. Phase 2 is a net regression on its own; Phase 3 /
+3β recover most of it for a ~7 MB heap win that doesn't justify
+the CPU cost. Phase 1's bound on mainBuf isn't material on its
+own (~60 K slots of slack out of 2.4 M), but it lays the
+plumbing for any future shape change to ship without re-doing
+the wiring.
+
+[`docs/lib/measure-pass.mjs`](../../docs/lib/measure-pass.mjs)
+ships as a library (the production home of the walker).
+`perf/phase0-measure.mjs` is left alone -- it's the historical
+record of the viability gate, intentionally self-contained even
+though it now duplicates the walker.
+
+## Dropping the owned bit (post-Phase-1 cleanup)
+
+The One-buffer PDFDict layout above carried an `owned` flag at
+bit 38, distinguishing parser-created ("shared") ranges from
+factory-created ("owned") ones. Its only behavioural effect was
+gating the `set` append path: a dict was allowed to extend in
+place at the high-water mark only if `owned`.
+
+Re-reading the safety argument: each parseDict commits a
+contiguous frame to main and mainLen advances past it. No two
+PDFDict instances share slots. So if a dict's range satisfies
+`start + length === mainLen`, nothing past mainLen is initialised
+and the slots are free to claim -- *regardless* of whether the
+range came from the parser or a factory call. The owned/shared
+distinction doesn't correspond to anything the safety check
+needs.
+
+Dropping it:
+
+- `pack(start, length)` -- third arg gone, no OR-in of `POW_38`.
+- `_owned`, `POW_38` -- deleted.
+- `_cow` -- collapses to one branch (was two identical-except-
+  for-the-HWM-early-return paths).
+- `set` -- the gating condition simplifies from
+  `!_owned(d0) || start0 + length0 !== mainLen` to just
+  `start0 + length0 !== mainLen`.
+- `_makeFromRange(ProtoClass, start, length, ctx)` -- owned param
+  gone; `_ownedFromArray` renamed `_makeFromAppend` for accuracy.
+- Bit 38 is now spare; spare grows from 14 to 15 bits.
+
+Net behavioural change: shared dicts that still abut the HWM at
+first `set` now extend in place instead of COWing, saving ~5-10
+slot copies per such mutation. Tiny win, but in the right
+direction.
+
+Validated byte-identical on both the no-measure-pass path and
+the `--measure-pass` path; structural diff (1 651 pages, 1 773
+outline nodes, matching titles) holds. Heap is flat as expected
+-- this is a code simplification, not an allocation-pattern
+change.
+
+## Slot-type histogram for mainBuf
+
+The next attack surface on GC self-time -- the ~150 ms left after
+fast-dict-onebuf -- is converting `main` from `Array` (Object
+references that V8 must mark) to `Float64Array` (Number slots
+that V8 ignores during mark). That only works if every slot
+value can be encoded as a Number, or pooled into a side table
+where the marker count is small.
+
+To scope that work, [`perf/instrument-slot-types.mjs`](../instrument-slot-types.mjs)
+walks `main[0..mainLen)` after setOutline and classifies each
+slot by PDFObject subtype. The instrumentation hangs off two new
+exports on fast-dict-onebuf (the `main` Array itself and a
+`getMainLen()` getter) and runs behind a new
+`--instrument-slot-types` flag on `measure.mjs` that requires
+`--fast-dict-onebuf` and skips the incremental / render-only
+paths.
+
+Distribution on the book (production shim set + `--measure-pass`,
+total slots = 2 358 630, keys = 1 179 315, values = 1 179 315):
+
+```
+type           keys      key%       values    value%   total%
+-----------------------------------------------------------------
+PDFName        1179315   100.00%    493256    41.83%   70.91%
+PDFRef               0     0.00%    435217    36.90%   18.45%
+PDFNumber            0     0.00%    162325    13.76%    6.88%
+PDFArray             0     0.00%     79468     6.74%    3.37%
+PDFDict              0     0.00%      5660     0.48%    0.24%
+PDFHexString         0     0.00%      1776     0.15%    0.08%
+PDFString            0     0.00%      1601     0.14%    0.07%
+PDFBool.True         0     0.00%        12     0.00%   0.0005%
+PDFBool.False        0     0.00%         0     0.00%        0
+PDFNull              0     0.00%         0     0.00%        0
+```
+
+Key findings:
+
+1. **Keys are 100 % PDFName** -- the even/odd invariant the
+   parser maintains holds. Encoding keys as the name's pool
+   index is unambiguous.
+2. **Four big pools (Name, Ref, Number, Dict) cover 96.5 % of
+   all slots.** Encoding them directly as Numbers in a
+   Float64 mainBuf collapses ~96 % of slot-mark traversals.
+3. **Side-pool fallback for unpooled types (Array, String,
+   HexString) is ~3.5 %** -- ~82 800 slots that V8 would
+   still mark via the side `Object[]`, vs ~2.34 M today.
+4. **Nested PDFDicts as slot values are only 5 660** -- most
+   dicts are referenced via PDFRef rather than embedded inline.
+5. **Bool / Null / RawStream in dict slots are essentially zero**
+   -- tag-only encoding (a few reserved sentinel Numbers)
+   covers them.
+
+Classification cost: 39 ms (single pass over 2.36 M slots).
+
+This shape is informative even though it doesn't itself ship a
+change. The subsequent Phase 2 / Phase 3 prototypes (next two
+sections) use these numbers to predict their wins; both turn out
+not to ship for reasons documented there.
+
+## Phase 2: Float64Array mainBuf + encoded slots (explored, didn't ship)
+
+The next architectural step from Phase 1. `main` becomes a
+`Float64Array`; every entry (key and value alike) is encoded as
+a 4-bit type tag + 49-bit pool id / payload packed into a single
+Float64. The hypothesis was that V8 would stop marking the 2.34 M
+Object-ref slots in `main` during GC, dropping mark-phase cost.
+
+Prototyped as `fast-dict-encoded.mjs`. Outcome: **wash.** The
+slot-mark-cost win is real (mainBuf's 2.34 M Object-ref slots →
+Float64 slots → V8 marks zero of them) but the mark cost it
+removes was never large -- pointer-array marks are fast in V8. The
+encoding overhead (per-slot encode at parse, per-slot decode at
+save) roughly cancels the savings; heap goes up ~3 MB from the
+new pool Maps (numberByValue, stringByValue, hexByValue,
+refGnByKey). The code was kept in faraday as opt-in (foundation
+for Phase 3) but is not pulled into staging; the design rationale
+below is the takeaway worth preserving.
+
+### Encoding scheme
+
+```
+Float64 slot (within Number.MAX_SAFE_INTEGER = 2^53 - 1):
+  bits 49-52  : type tag (4 bits, 16 possible, 11 used)
+  bits  0-48  : payload (49 bits)
+
+Tags:
+  0   PDFNull       (payload = 0)
+  1   PDFBool.False (payload = 0)
+  2   PDFBool.True  (payload = 0)
+  3   PDFName       (payload = name pool id)
+  4   PDFRef gen=0  (payload = objectNumber)
+  5   PDFRef gen!=0 (payload = side pool id)
+  6   PDFNumber     (payload = number pool id)
+  7   PDFDict       (payload = packed (start, length) -- the
+                    existing 38-bit fast-dict-onebuf encoding)
+  8   PDFArray      (payload = array pool id)
+  9   PDFString     (payload = string pool id, value-dedup)
+  10  PDFHexString  (payload = hex pool id, value-dedup)
+  11-15  reserved
+```
+
+### Pool subsumption
+
+The shim absorbs three existing pools under one umbrella and
+adds new pools for the previously unpooled types:
+
+- `PDFRef.of` -- patched to assign `_encId` to each instance;
+  gen=0 uses `objectNumber` as id (dense `refByObjNum[]`); gen!=0
+  uses a sequential side-pool. Would subsume **`--fast-refs`**.
+- `PDFNumber.of` -- patched to assign `_encId`; value-dedup via
+  `numberByValue` Map + parallel `numberById[]`. Would subsume
+  **`--fast-pdfnumber-pool`**.
+- `PDFName.of` -- pdf-lib already pools by string; extended
+  with `_encId` assignment + `nameById[]` for decode.
+- `PDFArray`, `PDFString`, `PDFHexString` -- new pools (none
+  existed). `PDFArray` is mutable so no value-dedup, just
+  sequential id. Strings/HexStrings are immutable so dedup by
+  `value`.
+
+Mutually exclusive with `--fast-dict-onebuf`, `--fast-refs`,
+`--fast-pdfnumber-pool`, and the older dict-shape shims.
+
+### A trap worth recording: eager dictByPayload caching
+
+The first cut of `_makeFromRange` registered every parse-created
+PDFDict in a `dictByPayload` Map so `decodeValue(TAG_DICT)` would
+return the same instance. That writes 261 k Map entries during
+parse -- `set @ (no url):0` shot to **15.4 MB / 29 %** of the
+heap profile, and total sampled heap went 65 → 92 MB (+27 MB).
+
+The fix is the same kind of insight as the lazy materialization
+pattern that surfaced earlier: top-level dicts (226 k) live in
+`PDFContext.indirectObjects` and are never decoded via
+`TAG_DICT` (their entries are in main, but they themselves
+aren't slot values). Only nested dicts (~5 660) are accessed via
+`TAG_DICT` decode. Caching them lazily on first access caps
+`dictByPayload` at ~5 660 entries (~360 KB) and collapses the
+regression. Same shape of bug as the IC-invalidation gotcha in
+Phase 1: a plausible-looking eager cache landed an enormous heap
+regression that only made sense once you saw which population
+was actually being decoded vs only being written.
+
+### Mixed measured result
+
+| Metric | Phase 1 | Phase 2 | Delta |
+|---|---:|---:|---:|
+| Process wall (clean run) | 1.16 s | 1.18 s | ~+20 ms (noise) |
+| GC self-time (CPU profile) | 151 ms | 149 ms | ~0 ms |
+| GC total (`--trace-gc` full process) | 190 ms | 159 ms | -31 ms |
+| Mark-Compact events | 8 | 10 | +2 |
+| Scavenge events | 26 | 26 | 0 |
+| Heap allocation sampled | 65.4 MB | 68.5 MB | **+3 MB** |
+| Live mainBuf slots V8 marks | ~2.34 M | ~0 (Float64Array) | -100 % |
+| Structural output | byte-identical | byte-identical | -- |
+
+**Phase 2 is a wash.** The encoding overhead roughly cancels the
+mark-phase savings, and the new pool Maps cost more than the
+slot-mark reduction is worth.
+
+The first CPU profile of P2 showed +39 ms GC and +130 ms wall,
+but reruns landed it back near Phase 1. The original numbers were
+single-run noise (slow Scavenge cluster on a busy machine).
+
+### Why faraday kept it as opt-in, and why staging doesn't
+
+Two reasons faraday left it in tree:
+
+1. **Pool ID infrastructure is reusable.** Phase 3 (PDFArray
+   storage refactor) uses the same encoding scheme, same pools,
+   same `encodeValue` / `decodeValue` -- it piggybacks on
+   Phase 2 for free.
+2. **Validates the architecture.** Float64Array mainBuf works,
+   byte-identical, no correctness issues. If a future workload
+   stresses mainBuf mark cost more, Phase 2 would be ready.
+
+Phase 3 also doesn't ship (next section), so the dependency
+chain doesn't earn its keep on staging. Dropping Phase 2's code
+keeps the production import chain narrow; the design notes here
+are the part worth preserving.
+
+## Phase 3: PDFArray storage refactor (explored, didn't ship)
+
+Phase 2's `fast-dict-encoded.mjs` grew a sibling structure for
+PDFArray. Each PDFArray instance becomes a view into a shared
+`arrayBuf` Float64Array, with `this.d` packing `(start, length)`
+-- same shape as PDFDict in Phase 2, with one more length bit
+(max single array is 25 308 elements vs max single dict 8 706
+slots). Per-instance `this.array = []` allocation goes away.
+
+Same opt-in story as Phase 2 (and same don't-ship verdict on
+staging): heap win is real but the CPU regression at save time
+dominates.
+
+### The mechanism
+
+| | PDFDict (Phase 2) | PDFArray (Phase 3) |
+|---|---|---|
+| Backing buffer | `main` Float64Array | `arrayBuf` Float64Array |
+| Per-instance | `this.d` = packed `(start, length)` | same |
+| Bit budget | 24 + 14 = 38 bits | 24 + 15 = 39 bits |
+| Slot encoding | 4-bit tag + 49-bit payload | same scheme |
+| Lazy cache | `dictByPayload` | `arrayByPayload` |
+| Parser temp | `_dictTemp` (Float64Array) | `_arrayTemp` (Float64Array) |
+| TAG_ARRAY slot | was `OFF_ARRAY + arrayId` | now `OFF_ARRAY + arr.d` |
+
+Phase 2's `_assignArrayId` and `arrayById[]` pool are gone -- the
+view-payload encoding makes them obsolete. Phase 2's encoding
+scheme for TAG_ARRAY changes from a pool-id payload to the
+direct `(start, length)` payload that mirrors TAG_DICT.
+
+### Mutation paths
+
+`PDFArray.prototype` methods rewritten:
+
+- `size` -- reads length from `this.d`
+- `push` -- extend in place at HWM, else COW (same pattern as
+  PDFDict.set's append case)
+- `get(i)` / `set(i, v)` -- decode/encode at `arrayBuf[start + i]`
+- `insert(i, v)` / `remove(i)` -- always COW (would corrupt
+  neighbouring arrays' ranges otherwise)
+- `indexOf` -- compare encoded payloads, no decode needed
+- `asArray` / `clone` / `toString` / `sizeInBytes` /
+  `copyBytesInto` -- decode each element
+
+`PDFArray.withContext` bypasses the inherited constructor's
+`this.array = []` allocation by `Object.create`-ing the
+instance and setting `this.d` directly.
+
+### parseArray patch
+
+Same temp-then-commit pattern as parseDict. Each parser instance
+gets its own `_arrayTemp` Float64Array; parseArray pushes
+encoded elements onto temp, commits the frame to `arrayBuf` in
+one contiguous `arrayBuf.set(...)`, pops temp back. Recursion
+across dicts and arrays is fine because `_dictTemp` and
+`_arrayTemp` are separate.
+
+### Measured result: heap win + CPU regression
+
+Combined Phase 2+3 vs Phase 1 baseline (paired, production set):
+
+| Metric | Phase 1 baseline | Phase 2+3 | Delta |
+|---|---:|---:|---:|
+| Heap sampled | 65.4 MB | **57.8 MB** | **-7.6 MB (-12 %)** |
+| `parseArray` self-attribution | 19.6 MB | ~0 (out of top 10) | **-19.6 MB**, replaced by arrayBuf-mediated writes |
+| `_makeFromRange` | 16.5 MB | 14.3 MB | -2.2 MB |
+| GC self-time (CPU profile) | 149 ms | 144 ms | -5 ms (flat) |
+| Process duration | 1.09 s | 1.45 s | **+360 ms (+33 %)** |
+| Structural output | byte-identical | byte-identical | -- |
+
+The heap win is what we hoped for: PDFArrays stop allocating
+per-instance `[]` backing arrays (79 k of them), and parseArray
+drops out of the allocation attribution because writes go to
+the shared `arrayBuf`.
+
+The CPU regression is the killer. The cost comes from per-slot
+decode during save -- `PDFDict.copyBytesInto` and
+`PDFArray.copyBytesInto` together iterate ~3 M slots, calling
+`decodeValue` once per slot. `decodeValue` is a 10-case switch
+plus a pool lookup; V8 doesn't inline it across the prototype
+boundary. ~100 ns per call × 3 M = ~300 ms. GC didn't move
+much. The slot-mark savings from Float64Array `arrayBuf` are
+real, but as with Phase 2 they're small relative to total mark
+cost. V8 marks pointer arrays fast.
+
+### Why faraday kept it as opt-in, and why staging doesn't
+
+Phase 3 validates the architecture for both data structures
+(Float64Array storage works for dicts AND arrays, byte-identical,
+no correctness issues) and the heap win is real (-7.6 MB / -12 %
+is not nothing). It also sets up an obvious follow-up:
+hand-inline the common decode cases at the hot copyBytesInto /
+sizeInBytes call sites. That's Phase 3β below -- which recovers
+much of the 300 ms but the net win still doesn't justify the
+engineering surface for our pipeline, so the whole encoded
+architecture stays off staging.
+
+### Caveats / known limitations
+
+- Direct `new PDFArray(context)` (rather than the
+  `PDFArray.withContext` factory) would leave `this.d` undefined
+  and methods would misbehave. pdf-lib's parser and our
+  setOutline go through the factory, but a hypothetical caller
+  using `new` would need the factory or a defensive init guard.
+- `PDFArray.scalePDFNumbers` (in pdf-lib's PDFArray; not
+  rewritten here) goes through `get`/`set` and so would work
+  transparently via the encoded path. Not exercised in the book
+  build.
+- PDFArrays nested in PDFArrays via `TAG_ARRAY` decode lazily,
+  same pattern as nested dicts; `arrayByPayload` caps at the
+  number of distinct nested-array payloads (small).
+
+## Phase 3β: hand-inline decodeValue at the save hot path (explored, didn't ship)
+
+The Phase 3 CPU regression was almost entirely per-slot decode
+during save -- `PDFDict.copyBytesInto`, `PDFDict.sizeInBytes`,
+`PDFArray.copyBytesInto`, `PDFArray.sizeInBytes` together
+iterate ~3 M slots, each calling `decodeValue` (10-case switch
++ pool lookup). V8 doesn't inline the function across the
+prototype-method boundary; ~100 ns × 3 M ≈ +300 ms.
+
+Phase 3β hand-inlines `decodeValue`'s switch into all four hot
+methods. The switch body is copy-pasted verbatim into each
+loop, giving V8 a monomorphic `.copyBytesInto` /
+`.sizeInBytes` call site per case branch.
+
+### Measured
+
+| Frame | P1 baseline | P3 (pre-inline) | **P3β** | β vs P1 |
+|---|---:|---:|---:|---:|
+| `(garbage collector)` | 149 ms | 144 ms | **130 ms** | **-19 ms (win)** |
+| `PDFObjectParser.parseName` | 87 ms | 106 ms | **70 ms** | **-17 ms (win)** |
+| `fastParseDict*` | 40 ms | 59 ms | 63 ms | +23 ms (encode at parse) |
+| `PDFDict.copyBytesInto` | 27 ms | 57 ms | **49 ms** | +22 ms |
+| `PDFDict.sizeInBytes` | (<top15) | (<top15) | 33 ms | new |
+| Heap sampled | 65.4 MB | 57.8 MB | **58.0 MB** | **-7.4 MB (win)** |
+| Structural | byte-identical | byte-identical | byte-identical | -- |
+
+The wins (GC -19 ms, parseName -17 ms) are real. parseName's
+drop is surprising but consistent across reruns -- the
+inlined switch made some call sites monomorphic that weren't
+before, and V8 re-optimized parseName as a downstream effect.
+
+The losses (encode-at-parse +23 ms, copyBytesInto +22 ms,
+sizeInBytes +33 ms) come from the inlined 10-case switch
+itself. Each iteration in the hot loop pays for the tag
+dispatch.
+
+### Architectural conclusion (Phase 2 + 3 + β closeout)
+
+Float64Array encoded storage **does work** -- byte-identical
+output, mainBuf and arrayBuf mark cost goes to zero, ~7.4 MB
+heap saved, GC drops ~20 ms. But it doesn't pull its weight
+on this workload because:
+
+1. **V8 marks pointer arrays fast.** mainBuf's 2.34 M
+   Object[] slots cost ~10-20 ms of mark time, not the 100+ ms
+   we assumed. The slot-mark savings are real but small.
+2. **The encoding scheme adds per-slot work that exceeds the
+   savings.** Encode at parse + decode at save = ~50 ms net
+   loss in the hot loops, even with hand-inlining.
+3. **The original polymorphic `main[i].copyBytesInto()` was
+   actually fine.** V8's megamorphic IC handled it well.
+   Replacing with explicit switch + monomorphic per-case
+   dispatch *helps slightly* in GC and parseName but
+   *hurts in dict hot paths*.
+
+The work isn't wasted -- the design notes here quantify *why*
+this approach isn't the right lever, and the pool ID
+infrastructure could be reused if a future optimization needs
+cross-type instance lookup. If a future workload stresses
+mainBuf mark cost more (much larger documents, more aggressive
+GC pressure, or a different V8 version) the encoded path is a
+known-correct starting point.
+
+Production stays on:
+
+- `--fast-dict-onebuf` (Object[] mainBuf with packed view)
+- `--fast-refs`, `--fast-pdfnumber-pool` (the pool shims that
+  fast-dict-encoded would have subsumed)
+- All other shipped `--fast-*` shims unchanged
+
+The next move on the same theme is the much narrower
+"one-buffer for PDFArray" -- skip the encoded scheme entirely
+and just mirror fast-dict-onebuf's shape onto PDFArray, keeping
+the Object[] storage and inheriting the same low-overhead
+view-with-packed-payload trick. That's the fast-array-onebuf
+section below; it does ship.
+
+## One-buffer PDFArray
+
+Mirror of fast-dict-onebuf's strategy applied to PDFArray. Every
+committed element lives in a single append-only `arrayMain` JS
+Array, kept for the document's lifetime. Each PDFArray instance
+is a view via packed `(start, length)` in `d`. Per-instance
+`this.array = []` allocation goes away; ~79 k PDFArrays stop
+allocating per-instance backing arrays + grow doublings.
+
+Storage is a plain heterogeneous JS Array -- slots hold the
+original PDFObject references, reads are `arrayMain[start + i]`
+with no decode. This is the explored-but-didn't-ship Phase 3
+shape (PDFArray as a view into a shared backing) minus the
+Float64Array encoding: Phase 3 paid ~300 ms of `decodeValue`
+dispatch on save's `copyBytesInto` (~3 M slots × 10-case switch
++ pool lookup). The plain-reference shape skips that entirely
+and is what makes fast-array-onebuf cheap to ship.
+
+### Parser temp + commit
+
+Per-parser `_arrayTemp` + length cursor as a recursion stack,
+parallel to fast-dict-onebuf's `_dictTemp`. Each `parseArray`
+invocation pushes onto temp, commits its frame to `arrayMain`
+in one contiguous append, and pops temp back. Dict and array
+temps are independent so cross-recursion is fine.
+
+### Mutations
+
+- `set(i, v)` -- in-place replace at `arrayMain[start + i]`.
+  Safe for any array; no shifts.
+- `push(v)` -- in-place extend at HWM (`arrayMain.push(v)` +
+  length += 1) when `start + length === arrayMain.length`;
+  COW otherwise.
+- `insert(i, v)` / `remove(i)` -- always COW. Shifting slots
+  in `arrayMain` would corrupt other arrays' ranges.
+
+Same at-HWM safety logic as fast-dict-onebuf; no owned bit
+needed (`start + length === arrayMain.length` is sufficient).
+
+### Bit layout
+
+```
+bits  0-23: start  (24 bits, max 16 M slots)
+bits 24-39: length (16 bits, max 65 536 elements; max observed
+                    ~25 k on the book)
+```
+
+40 bits used, well within `Number.MAX_SAFE_INTEGER`. Two more
+length bits than fast-dict-onebuf's 14-bit dict length, because
+arrays can be larger than dicts on this workload.
+
+### Singleton context (duplicated)
+
+Same singleton-PDFContext assumption as fast-dict-onebuf, but
+the ~10 lines of context-stash machinery are duplicated rather
+than shared, so each shim stays independently injectable. A
+caller can opt into one without the other; both are independent
+side-effecting imports.
+
+### Production wiring
+
+- [`docs/render-book.mjs`](../../docs/render-book.mjs) -- imports
+  `setExpectedArraySlots` alongside `setExpectedDictSlots`, calls
+  both after `measureRawPdf` returns and before `PDFDocument.load`.
+- [`perf/measure.mjs`](../measure.mjs) -- adds `--fast-array-onebuf`
+  flag. Composes with `--fast-dict-onebuf`; `--measure-pass` also
+  drives `setExpectedArraySlots` when the array shim is on.
+- The harness's `--fast-array-onebuf` is opt-in alongside the
+  production path, the same arrangement as `--fast-dict-onebuf`.
+
+### Measured wins
+
+Heap impact (process phase, 512 B sampling, paired runs vs the
+Phase 1 baseline that was the immediate predecessor of this
+shim):
+
+| Allocator                | P1 baseline | + fast-array-onebuf | Delta              |
+|--------------------------|------------:|--------------------:|-------------------:|
+| `parseArray`             |    19.6 MB  |             ~0 (off top 15) | **-19.6 MB**  |
+| new shim row (PDFArray wrappers) | -   |             4.2 MB   | +4.2 MB           |
+| Total sampled            |    65.6 MB  |            **51.9 MB**       | **-13.7 MB (-21 %)** |
+
+CPU impact (process wall, pinned 0x5500 / High, no profiler,
+3 paired runs each side):
+
+| State            | median | mean   |
+|------------------|-------:|-------:|
+| P1 only          | 1.07 s | 1.09 s |
+| P1 + this shim   | 1.02 s | 1.01 s |
+
+Mean shifts -0.08 s (1.09 s → 1.01 s) -- this shim is slightly
+faster, but well within noise on this machine.
+
+The CPU regression that showed up under
+`--cpu-profile-process` (paired with the encoded-storage
+prototype) was profiler-induced noise; the sampler's per-allocation
+bookkeeping interacts badly with this shape. Gone once we pin
+CPU and drop the sampler. Worth remembering: when the only
+signal saying "this is slower" is the profiler, run the same
+code without the profiler before accepting the verdict.
+
+### Cumulative arc (final)
+
+Heap, starting from the original Map-backed PDFDict:
+
+| State                             | Total sampled | Change vs prior |
+|-----------------------------------|--------------:|----------------:|
+| Map-backed (pre-fast-dict-array)  |   152 MB      | -               |
+| fast-dict-array                   |    92 MB      | -60 MB          |
+| fast-dict-onebuf                  |    66 MB      | -26 MB          |
+| **fast-array-onebuf**             |    **52 MB**  | **-14 MB**      |
+
+**-66 % cumulative reduction in process-phase heap traffic.**
+The final state of this storage-shape work. The endpoint of
+the dict + array allocator refactors that this notes file has
+been chasing for the last ~22 sections.
+
+## Drop the per-instance `PDFRef.tag` string
+
+With `fast-array-onebuf` shipping, the process-phase sampling heap
+profile flipped to `PDFParser.parseIndirectObjectHeader` at 13.7 MB
+/ 25 % of total. Attribution chain (via
+`perf/find-heap-callers.mjs`):
+
+```
+parseIndirectObjectHeader  → skipJibberish (14.2 MB)
+  → matchIndirectObjectHeader (try/catch wrapper)
+    → parseIndirectObjectHeader → fastOf
+```
+
+`skipJibberish` runs after every successful indirect object parse
+and speculatively calls `matchIndirectObjectHeader` to detect the
+next `N M obj` header. On valid PDFs the speculation always
+succeeds, so `fastOf` fires once per indirect-object boundary,
+populating the dense-array cache; the subsequent "real"
+`parseIndirectObject` is then a cache hit. V8 inlines `fastOf` at
+this call site (small + hot from speculation) so the attribution
+lands on the caller -- 13.7 MB of which was the tag-string churn
+(`objectNumber + ' 0 R'`): V8 builds 1-2 intermediate concat
+strings + the final ~25-35 B tag, ~150 k times.
+
+### Upstream
+
+`PDFRef` (`pdf-lib/.../objects/PDFRef.js`) caches the
+`<obj> <gen> R` string on each instance:
+
+```js
+function PDFRef(objectNumber, generationNumber) {
+  var _this = this;
+  ...
+  _this.tag = objectNumber + ' ' + generationNumber + ' R';
+}
+```
+
+so that `toString` / `sizeInBytes` / `copyBytesInto` can read it
+back -- the three prototype methods are then trivial (`this.tag`,
+`this.tag.length`, `copyStringIntoBuffer(this.tag, ...)`). The
+earlier `fast-refs` shim already constructs the gen=0 PDFRef via
+`Object.create(PDFRef.prototype)` + manual field init, so it
+populated `tag` itself to preserve those reads.
+
+### The shim
+
+Drop the field entirely. The three prototype methods compute their
+results from `objectNumber` / `generationNumber` directly:
+
+- `copyBytesInto`: writes digits straight into the output buffer
+  via a no-allocation `_writeUint` helper
+  (divide-and-write-backwards into the caller's buffer). No
+  `copyStringIntoBuffer` call.
+- `sizeInBytes`: returns `_digitCount(obj) + _digitCount(gen) + 3`
+  (the trailing 3 covers " " + " R"). `_digitCount` is a ladder
+  catching the common small-number cases without arithmetic.
+- `toString`: builds on demand. Debug-only path, no caching needed.
+
+Both gen=0 (no tag set; `fastOf` skips the upstream constructor)
+and gen!=0 (tag set by upstream's constructor but our overrides
+ignore it) work. The gen!=0 path's tag string is
+allocated-then-wasted (~18 % of refs × ~50 K instances × ~30 B
+= ~1 MB), bounded enough not to be worth patching the upstream
+constructor for.
+
+### Measured heap
+
+Process phase, 512 B sampling, paired runs vs the
+`fast-array-onebuf` baseline:
+
+| Allocator                       | Pre (MB) | Post (MB) | Delta              |
+|---------------------------------|---------:|----------:|-------------------:|
+| `parseIndirectObjectHeader`     |    13.7  |     9.3   | **-4.3 MB**        |
+| `fastOf` (refs)                 |     7.7  |     4.8   | **-2.9 MB**        |
+| Total sampled                   |    51.9  |    45.2   | **-6.7 MB (-13 %)** |
+
+The `parseArray` row was already collapsed by `fast-array-onebuf`,
+so this round attacks the next-largest remaining attribution. The
+residual 9.3 MB at `parseIndirectObjectHeader` and 4.8 MB at
+`fastOf` are the `PDFRef` instances themselves (`Object.create` +
+`objectNumber` + `generationNumber` fields, ~32-48 B × ~150 k)
+plus V8 inlining leakage from the `fastOf` speculation call site.
+Hard floor without dropping per-PDFRef wrappers entirely (which
+the class-shape round below picks up).
+
+### Measured CPU
+
+Pinned 0x5500 / High, no profiler, 4 runs each side:
+
+| State    |  median  |   mean   |
+|----------|---------:|---------:|
+| with-tag | 1.045 s  | 1.045 s  |
+| tagless  | 1.030 s  | 1.030 s  |
+| Δ        | ~15 ms tagless faster (in the noise but trending) |
+
+### Validation
+
+Output PDF is byte-identical to baseline modulo `/CreationDate`
++ `/ModDate` timestamps -- verified by inflating + diffing all
+453 ObjStm streams. The change is local to
+[`docs/lib/fast-refs.mjs`](../../docs/lib/fast-refs.mjs); no
+production import or flag change needed since `--fast-refs` was
+already wired up.
+
+## `skipJibberish` digit-byte fast path
+
+The same `find-heap-callers.mjs` chain that surfaced the `PDFRef.tag`
+churn (previous section) named another redundancy worth chasing on
+the CPU side:
+
+```
+parseIndirectObjectHeader  → skipJibberish (14.2 MB)
+  → matchIndirectObjectHeader (try/catch wrapper)
+    → parseIndirectObjectHeader → fastOf
+```
+
+`skipJibberish` runs after every successful indirect object parse
+and exists only to recover from invalid PDFs that wedge garbage
+between indirect objects. Its hot path fires ~150 k times per load
+on the book, each call speculatively running:
+
+1. `matchKeyword('xref' / 'trailer' / 'startxref')` -- all fail on a
+   digit byte.
+2. `matchIndirectObjectHeader` -- a `try` / `catch` around
+   `parseIndirectObjectHeader` → `parseRawInt` × 2 →
+   `matchKeyword('obj')` → `fastOf` round-trip. The speculation
+   succeeds every time on a valid PDF, the cursor rewinds, and the
+   outer `while`'s `IsDigit` check confirms what the speculation
+   already proved.
+
+### Where the speculation lives
+
+`PDFParser.parseDocument`'s inner loop already calls
+`skipWhitespaceAndComments` between indirect objects. Patch a
+single-byte peek in front of `skipJibberish`:
+
+```js
+if (!this.bytes.done() && IsDigit[this.bytes.peek()]) continue;
+this.skipJibberish();
+```
+
+When the next byte is a digit (start of the next `N M obj` header
+on every valid PDF), `continue` skips straight to the next
+`parseIndirectObject`. Anything else (`xref` / `trailer` /
+`startxref` keyword starts, or real jibberish between indirect-object
+sections) falls through to `skipJibberish` unchanged.
+
+The once-per-section `skipJibberish` in `parseDocumentSection`
+(after `maybeParseTrailer`) is unaffected -- it handles boundaries
+between PDF revisions / EOF where stray bytes are spec-legal.
+
+### Measured CPU
+
+Pinned 0x5500 / High, no profiler, 4 paired runs:
+
+| State                | median  | mean    |
+|----------------------|--------:|--------:|
+| without fast path    | 1.07 s  | 1.053 s |
+| with fast path       | 0.995 s | 0.985 s |
+| Δ                    | ~67 ms faster (mean), ~6 % of process phase |
+
+Phase breakdown isolates the win to load (mean 0.518 → 0.455 s,
+-62 ms); save is flat as expected -- the fast path is load-side
+only.
+
+### Heap
+
+Unchanged (0 MB delta). The `PDFRef` instances the speculation
+used to allocate are now allocated by the real
+`parseIndirectObject` on its cache miss -- the attribution moves
+to a different frame, but no allocation is added or removed. The
+fast path skips the speculation's `try` / `catch` + dispatch
+overhead, not its allocation tail.
+
+### Validation
+
+Output PDF byte-identical to the pre-patch baseline (verified by
+inflating + diffing all 453 ObjStm streams modulo `/CreationDate`
++ `/ModDate` timestamps). The change is local to
+[`docs/lib/fast-sync-load.mjs`](../../docs/lib/fast-sync-load.mjs);
+no production import or flag change needed since `--fast-sync-load`
+was already wired up.
+
+## Class-constructor `PDFRef` shape
+
+The `Object.create + writes` trick the original `fast-refs` shim uses
+to skip the upstream `ENFORCER` check and `pool.set` (see [Skip
+`PDFRef` `pool.set` on the gen=0 miss path](#skip-pdfref-poolset-on-the-gen0-miss-path)
+above) carries an unexpected per-instance cost: V8 transitions the
+hidden class through one intermediate map per property write and
+routes the result through the slow-property path. On the book a
+fast-refs-built PDFRef sits at ~60 B/instance vs PDFName's ~31 B
+(built via `new PDFName(...)` -- a real constructor with a stable
+hidden class from the first instance).
+
+### The shim
+
+Plain function used as a constructor, both fields set in one shot:
+
+```js
+function _FastRef(objectNumber, generationNumber) {
+  this.objectNumber = objectNumber;
+  this.generationNumber = generationNumber;
+}
+_FastRef.prototype = PDFRef.prototype;
+
+PDFRef.of = function fastClassOf(objectNumber, generationNumber) {
+  if (generationNumber === undefined || generationNumber === 0) {
+    const existing = pool0[objectNumber];
+    if (existing) return existing;
+    const fresh = new _FastRef(objectNumber, 0);
+    pool0[objectNumber] = fresh;
+    return fresh;
+  }
+  return original.call(PDFRef, objectNumber, generationNumber);
+};
+```
+
+Aliasing `_FastRef.prototype = PDFRef.prototype` keeps
+`instanceof PDFRef` satisfied AND means method dispatch resolves
+on the shared prototype (no extra proto-chain hop). gen != 0 still
+falls back to the upstream `PDFRef.of` Map-based pool (rare on
+freshly-parsed PDFs).
+
+Same `toString` / `sizeInBytes` / `copyBytesInto` prototype
+overrides as the tag-drop section above -- the constructor produces
+gen=0 PDFRefs with no `tag` field at all, and the gen!=0 upstream
+fallback still sets `tag` but our overrides ignore it.
+
+### Measured heap
+
+Paired heap profile (`--fast-refs` vs `--fast-refs-class`, with the
+rest of the production shim set on):
+
+| Allocator                       | Pre        | Post       | Delta                  |
+|---------------------------------|-----------:|-----------:|-----------------------:|
+| Total sampled                   |  45.26 MB  |  41.39 MB  | **-3.87 MB (-8.5 %)**  |
+| `fastOf` / `fastClassOf` row    |   4 696 KB |   3 435 KB | -1 261 KB              |
+| `create` (builtin)              |   3 379 KB |   2 627 KB | -752 KB                |
+| `parseIndirectObjectHeader` row |   9 115 KB |   7 435 KB | -1 680 KB              |
+
+Per-PDFRef savings work out to ~16 B/instance × 226 k unique refs
+= ~3.7 MB, close to the measured 3.87 MB total. Not the full
+30 B-to-PDFName-floor (PDFRef carries 2 fields vs PDFName's 1),
+but a clean win and the construction-style change applies
+symmetrically to the other `Object.create`-built shapes
+(`fast-dict-onebuf._makeFromRange`,
+`fast-array-onebuf._makeFromRange`) for the next round.
+
+### Measured CPU
+
+Paired wall-clock and profile (`--cpu-profile-process`):
+
+| Row                        | Pre      | Post     | Delta              |
+|----------------------------|---------:|---------:|-------------------:|
+| Process wall-clock         | 1.13 s   | 0.99 s   | **-140 ms (-12 %)** |
+| load                       | 0.52 s   | 0.47 s   | -50 ms              |
+| save                       | 0.51 s   | 0.44 s   | -70 ms              |
+| `fastOf` (PDFRef) self-time| 28 ms    | out of top 15 | drops off      |
+
+GC self-time barely moved (87 ms → 82 ms), consistent with the
+allocation-rate drop being modest relative to mark-cost -- the live
+`fast-dict-onebuf` mainBuf still dominates the GC bill.
+
+### Wiring
+
+- [`docs/lib/fast-refs-class.mjs`](../../docs/lib/fast-refs-class.mjs)
+  -- new shim. Same `_writeUint` / `_digitCount` helpers as
+  `fast-refs`; same prototype overrides; only the construction style
+  differs.
+- [`docs/render-book.mjs`](../../docs/render-book.mjs) -- swaps
+  `import './lib/fast-refs.mjs'` for `import './lib/fast-refs-class.mjs'`.
+  Production runs through the new shim.
+- [`perf/measure.mjs`](../measure.mjs) -- adds the
+  `--fast-refs-class` flag with a mutex check against `--fast-refs`
+  (both shim `PDFRef.of`; loading both silently would not be
+  obvious if it broke something).
+
+`fast-refs.mjs` stays in the tree as an A/B baseline -- the
+construction style is the whole point of the comparison, so being
+able to flip back to the older shape with a flag is worth the
+20 lines of duplication.
+
+## Class-constructor `PDFDict` shape
+
+The same shape change `fast-refs-class` applied to PDFRef (above),
+now applied to the four PDFDict classes fast-dict-onebuf
+constructs: `PDFDict` itself plus its subclasses `PDFCatalog`,
+`PDFPageTree`, and `PDFPageLeaf`.
+
+### Where fast-dict-onebuf was paying the same V8 tax
+
+`_makeFromRange` and the COW path inside `set` both build the
+wrapper instance via `Object.create(ProtoClass.prototype) + pd.d
+= ...` (plus `pd.normalized = false` / `pd.autoNormalizeCTM = true`
+for the PageLeaf case). On the book that's 260 k+ wrapper
+instances per load -- the dominant remaining heap row even after
+all the prior storage-shape work, with `_makeFromRange (dict)`
+showing 16.5 MB on the post-`fast-refs-class` profile.
+
+### The shim
+
+One plain-function constructor per subclass with the field
+assignments in the body. Aliasing each one's prototype to the
+upstream prototype keeps `instanceof` and method dispatch
+unchanged.
+
+```js
+function _FastDict(d) { this.d = d; }
+_FastDict.prototype = PDFDict.prototype;
+
+function _FastCatalog(d) { this.d = d; }
+_FastCatalog.prototype = PDFCatalog.prototype;
+
+function _FastPageTree(d) { this.d = d; }
+_FastPageTree.prototype = PDFPageTree.prototype;
+
+function _FastPageLeaf(d) {
+  this.d = d;
+  this.normalized = false;
+  this.autoNormalizeCTM = true;
+}
+_FastPageLeaf.prototype = PDFPageLeaf.prototype;
+
+function _makeFromRange(ProtoClass, start, length, ctx) {
+  _registerContext(ctx);
+  const d = pack(start, length);
+  if (ProtoClass === PDFDict)      return new _FastDict(d);
+  if (ProtoClass === PDFPageLeaf)  return new _FastPageLeaf(d);
+  if (ProtoClass === PDFCatalog)   return new _FastCatalog(d);
+  if (ProtoClass === PDFPageTree)  return new _FastPageTree(d);
+  // Defensive fallback for any unknown subclass.
+  const pd = Object.create(ProtoClass.prototype);
+  pd.d = d;
+  return pd;
+}
+```
+
+PageLeaf carries the extra `normalized` / `autoNormalizeCTM`
+fields -- they're assigned in the constructor body so V8 still sees
+a fixed shape per subclass. The COW path in `set` is updated in
+the same way (`return new _FastDict(pack(newStart, length))`).
+Unknown PDFDict subclasses fall back to the original Object.create
+path; nothing in our pipeline hits it (defensive only).
+
+### Measured heap
+
+Paired profile, `fast-refs-class` baseline vs + this change:
+
+| Allocator                       | Pre        | Post       | Delta              |
+|---------------------------------|-----------:|-----------:|-------------------:|
+| Total sampled                   |  41.39 MB  |  35.41 MB  | **-5.98 MB (-14.4 %)** |
+| `_makeFromRange` (dict)         |  16 484 KB |  11 404 KB | -5 080 KB          |
+| `create` (builtin)              |   2 627 KB |     921 KB | -1 706 KB          |
+| `_FastDict` (new row)           |     —      |     621 KB | +621 KB            |
+
+Per-PDFDict saving: ~20 B/instance × 260 k = ~5.2 MB. Matches the
+`_makeFromRange` delta + the builtin's drop minus the new
+constructor-frame attribution.
+
+**Cumulative since the `fast-refs` baseline** (i.e. including
+`fast-refs-class`): total sampled 45.26 MB → 35.41 MB =
+**-9.85 MB (-22 %)** over two shape-change commits. That brings
+the cumulative heap reduction since the Map-backed baseline to
+~77 % (152 MB → 35.4 MB).
+
+### Measured CPU
+
+Roughly flat -- process wall-clock 0.99 s → 1.03 s under cpu
+profile, within noise. GC self-time +19 ms (82 → 101 ms),
+consistent with the existing `fast-dict-onebuf` trade-off
+documented in the README: the dominant GC cost on this workload
+is the live `mainBuf` scan, not allocation rate, so cutting
+allocation doesn't move single-shot mark time. The
+allocation-rate reduction still matters for sustained-load
+memory pressure even when it doesn't show on a one-shot
+wall-clock.
+
+### Validation
+
+Output PDF byte-identical modulo `/CreationDate` + `/ModDate`
+timestamps -- only the JS object shape used to wrap the parsed
+dict range changed, not any content path. The change is local to
+[`docs/lib/fast-dict-onebuf.mjs`](../../docs/lib/fast-dict-onebuf.mjs);
+no production import or flag change needed since
+`--fast-dict-onebuf` was already wired up.
+
+## Class-constructor `PDFArray` shape
+
+The same shape change applied to PDFArray's factory paths. PDFArray
+has no subclasses in pdf-lib (unlike PDFDict), so a single
+`_FastArray` constructor covers both `_makeFromRange` and the COW
+paths (`push` away from the HWM, `insert`, `remove`; `set` is
+in-place for arrays and never copies):
+
+```js
+function _FastArray(d) { this.d = d; }
+_FastArray.prototype = PDFArray.prototype;
+
+function _makeFromRange(start, length, ctx) {
+  _registerContext(ctx);
+  return new _FastArray(pack(start, length));
+}
+```
+
+### Measured heap
+
+Paired profile, prior commit's dict-class baseline vs + this
+change:
+
+| Allocator                       | Pre        | Post       | Delta              |
+|---------------------------------|-----------:|-----------:|-------------------:|
+| Total sampled                   |  35.41 MB  |  33.68 MB  | **-1.73 MB (-4.9 %)** |
+| `fastParseArrayOneBuf` row      |   4 372 KB |   3 334 KB | -1 038 KB          |
+| `create` (builtin)              |     921 KB | out of top 15 | -921 KB        |
+
+Per-PDFArray saving: ~22 B/instance × ~80 k = ~1.7 MB. Matches the
+row delta + builtin drop.
+
+### Measured CPU -- the unexpected GC win
+
+| Row                | Pre       | Post     | Delta              |
+|--------------------|----------:|---------:|-------------------:|
+| Process wall-clock | 1.03 s    | 0.90 s   | **-130 ms (-13 %)** |
+| GC self-time       | 100.9 ms  | 58.7 ms  | **-42 ms (-42 %)**  |
+
+A surprising GC + wall-clock win for the smallest of the three
+heap drops. The likely reason is that with all three shape changes
+in place, V8 sees fully monomorphic call sites for PDFRef /
+PDFDict / PDFArray construction *and* method dispatch -- before
+the array change there was still one slow-property shape in the
+mix dragging IC perf. Confirmed by the cumulative process arc:
+
+| State                                  | process  | GC     |
+|----------------------------------------|---------:|-------:|
+| baseline (fast-refs)                   | 1.13 s   | 87 ms  |
+| + fast-refs-class                      | 0.99 s   | 82 ms  |
+| + fast-dict-onebuf class shape         | 1.03 s   | 101 ms |
+| + fast-array-onebuf class shape        | **0.90 s** | **59 ms** |
+
+The dict-only state had a slight CPU regression (+40 ms vs
+fast-refs-class) that the array change undid and then some.
+Argues strongly for shipping the full combo, not just the two
+big-heap-row ones.
+
+### Cumulative across the three shape-change commits
+
+Baseline (`fast-refs`) → all-three (`fast-array-onebuf class
+shape`):
+
+| Metric              | Pre        | Post       | Delta                |
+|---------------------|-----------:|-----------:|---------------------:|
+| Process wall-clock  | 1.13 s     | 0.90 s     | **-230 ms (-20 %)**  |
+| Total sampled heap  | 45.26 MB   | 33.68 MB   | **-11.58 MB (-25.6 %)** |
+| GC self-time        | 87 ms      | 59 ms      | **-32 %**            |
+
+Cumulative process-phase heap reduction since the Map-backed
+PDFDict baseline now stands at **~78 %** (152 MB → 33.7 MB).
+
+### Validation
+
+Output PDF byte-identical modulo `/CreationDate` + `/ModDate`
+timestamps. The change is local to
+[`docs/lib/fast-array-onebuf.mjs`](../../docs/lib/fast-array-onebuf.mjs);
+no production import or flag change needed since
+`--fast-array-onebuf` was already wired up.
+
+## Class-constructor round: closing the picture
+
+Recap of the three commits that just landed (PDFRef, PDFDict,
+PDFArray wrapper-shape changes): same attack, same constructor +
+prototype-aliasing trick. The per-instance numbers, before vs
+after, in one table:
+
+| Wrapper       | Before | After | Saved/inst | Count   | Total saved |
+|---------------|-------:|------:|-----------:|--------:|------------:|
+| PDFRef        |  ~60 B | ~44 B |     ~16 B  | 226 k   |   ~3.7 MB   |
+| PDFDict       |  ~64 B | ~44 B |     ~20 B  | 260 k   |   ~5.2 MB   |
+| PDFArray      |  ~54 B | ~32 B |     ~22 B  |  80 k   |   ~1.7 MB   |
+
+PDFRef stops at ~44 B because it carries 2 fields (`objectNumber`,
+`generationNumber`); PDFDict / PDFArray stop at ~32-44 B with 1
+field (the packed `d`). PDFPageLeaf carries 3 fields (d,
+normalized, autoNormalizeCTM) so it's slightly higher, but the
+constructor body still gives V8 the stable shape -- the 1 651
+PDFPageLeaf instances are a small tail.
+
+### Investigation aside: `parseIndirectObjectHeader` was a labelling artifact
+
+The hypothesis chain that led to the constructor-shape attack:
+
+1. Start: heap profile shows `parseIndirectObjectHeader` at 9.1 MB
+   self-attribution. Looks like a parser hot spot worth attacking.
+2. Hand-inline the entire function body (whitespace skip +
+   `parseRawInt` × 2 + `matchKeyword` + `PDFRef.of`) into a single
+   no-call body. Heap row barely moved (9.2 MB), CPU unchanged --
+   the row wasn't the call overhead.
+3. Disable V8 inlining with `node --no-turbo-inlining`. Heap row
+   collapses (9.2 MB → out of top 20). `fastOf` row jumps from
+   4.7 MB to 13.8 MB. Total sampled unchanged.
+
+Diagnosis: V8 inlines small hot leaf functions (like `fastOf`,
+when called from a hot caller) and attributes their allocations
+to the inliner's frame. The `parseIndirectObjectHeader` row name
+was misleading; the actual allocation source was the PDFRef
+instances being constructed downstream. Attacking the right thing
+(the wrapper shape) made the row drop too.
+
+The hand-inlined attempt (`fast-pioh.mjs`) was deleted after
+proving the negative; the call-counting instrumentation lives in
+[`perf/instrument-pioh.mjs`](../instrument-pioh.mjs). Both are
+recorded in this writeup as the path to the right answer rather
+than the answer itself.
+
+### Caveats
+
+- **Closed subclass set.** `fast-dict-onebuf` dispatches by
+  `ProtoClass === PDFDict | PDFCatalog | PDFPageTree |
+  PDFPageLeaf` to pick the right constructor. Any new PDFDict
+  subclass added in user code falls back to the original
+  `Object.create` path (defensive; nothing in our pipeline
+  triggers it). If the upstream PDFDict hierarchy grows, the
+  dispatch chain needs a new entry.
+- **Shared prototype.** `_FastRef.prototype = PDFRef.prototype`
+  means a `new _FastRef(...)` instance is indistinguishable from
+  a `new PDFRef(...)` instance via `instanceof` and method
+  dispatch. No code in our pipeline cares about constructor
+  identity (`obj.constructor === PDFRef` -- absent in pdf-lib +
+  our shims).
+- **Method dispatch stays polymorphic for gen != 0 PDFRefs.** The
+  `--fast-refs-class` shim only routes gen=0 through the
+  `_FastRef` constructor; gen != 0 falls back to upstream
+  `PDFRef.of` which uses its own Map-based pool and
+  `new PDFRef(...)`. Both shapes share `PDFRef.prototype` so
+  methods dispatch uniformly; V8 may see 2 maps but the path is
+  rare (~18 % of refs).
+
+### Where this leaves heap
+
+After the three commits, the top-5 heap rows are:
+
+| # | Self KB | Frame                                                              |
+|--:|--------:|--------------------------------------------------------------------|
+| 1 | 11 474  | `_makeFromRange` (PDFDict) -- 260 k × ~44 B floor                  |
+| 2 |  7 450  | `parseIndirectObjectHeader` -- V8 attribution of the next row      |
+| 3 |  3 411  | `fastClassOf` (PDFRef) -- 226 k × ~44 B floor                      |
+| 4 |  3 334  | `fastParseArrayOneBuf` (PDFArray) -- 80 k × ~32 B floor            |
+| 5 |  2 098  | `parseIndirectObjectSync` -- per-call attribution residual         |
+
+The big rows are now at the per-instance floor for V8 objects
+with 1-2 inline fields. Further heap reduction requires either:
+
+1. **Eliminate the wrapper entirely** -- PDFRef / PDFDict /
+   PDFArray become bare packed Numbers, every consumer rewritten
+   to call free functions instead of methods. Biggest remaining
+   win (~11 MB on PDFDict alone), largest engineering surface.
+2. **Smaller targeted shrinks** -- PDFNumber drops eager
+   `stringValue` cache, etc. Each at ~hundreds of KB,
+   accumulating slowly.
+
+Neither has been started; this section closes the per-instance
+constructor-shape round.
+
+## Byte-keyed cache for `PDFName` lookups
+
+After the constructor-shape round closed, the new #1 row in the
+process CPU profile was `PDFObjectParser.prototype.parseName` at
+**87 ms self + 57 ms via its `fastOf` callee = ~144 ms combined
+(~16 % of process)**. The function fires **1.68 M times per load**
+on the book. Worth interrogating before treating as a hot loop.
+
+### What `parseName` does
+
+```js
+PDFObjectParser.prototype.parseName = function () {
+    this.bytes.assertNext(CharCodes.ForwardSlash);
+    var name = '';
+    while (!this.bytes.done()) {
+        var byte = this.bytes.peek();
+        if (IsWhitespace[byte] || IsDelimiter[byte]) break;
+        name += charFromCode(byte);   // per-byte cons-string append
+        this.bytes.next();
+    }
+    return PDFName.of(name);
+};
+```
+
+Two obvious-looking attack surfaces:
+
+1. **Per-byte method dispatch** (`this.bytes.peek()`, `.next()`,
+   `.done()`) -- ~16 M method calls across the load.
+2. **Per-byte string concat** (`name += charFromCode(byte)`) --
+   ~16 M cons-string appends, allocating an intermediate state per
+   byte until V8 flattens at the `PDFName.of(name)` lookup.
+
+### Two false starts
+
+The first version of `fast-parse-name.mjs` (not committed) kept
+the cons-string accumulator but read `this.bytes.bytes` /
+`.idx` / `.length` directly to skip the per-byte ByteStream
+dispatch:
+
+```js
+let name = '';
+while (idx < len) {
+  const byte = buf[idx];
+  if (IsWhitespace[byte] || IsDelimiter[byte]) break;
+  name += String.fromCharCode(byte);
+  idx++;
+}
+```
+
+Result: process wall-clock essentially flat (0.90 → 1.00 s,
+within noise). `parseName` self dropped out of the top 15 -- but
+the saved time migrated to attribution on the callers
+(`fastParseDictOneBuf` +35 ms, `fastParseObject` +21 ms) under
+V8 inlining and GC self +15 ms. The total didn't move.
+
+Lesson: **V8 already optimises the cons-string `+=` path well.**
+Per-byte appends use cons-string representation that defers the
+flatten until first use (`PDFName.of`'s string-hash lookup forces
+it). The method-call dispatch was apparently not the dominant
+cost; the cons-string flatten + `Map<string, PDFName>` lookup
+was.
+
+A second sketch (also not committed) built the lookup string via
+`String.fromCharCode.apply(null, buf.subarray(start, idx))` --
+the textbook "one-shot allocation" approach. It made things
+**worse** (~123 ms vs upstream ~87 ms). `.apply` on a Uint8Array
+view is a V8 deopt path: the engine has to convert each
+typed-array element to a stack arg, and the overhead exceeds the
+cons-string build it was meant to replace.
+
+Both attempts missed the actual surface.
+
+### The real surface: 99.7 % of the work is cache hits
+
+The instrumentation script
+[`perf/instrument-objclasses.mjs`](../instrument-objclasses.mjs)
+reports:
+
+| | calls    | unique | hit rate |
+|---|--------:|-------:|---------:|
+| `PDFName.of` | 1 681 225 | 4 787 | **99.715 %** |
+
+Of every 1 000 `parseName` calls, **997** hand the same byte
+sequence to `PDFName.of`'s string-keyed Map, get back the same
+`PDFName` instance, and discard the lookup string. The string was
+built, hashed, looked up, and thrown away -- the answer was
+already known.
+
+`Type`, `Length`, `Pages`, `MediaBox`, `Resources`, `Contents`,
+`Parent`, `Kids`, `Count`, `Font`, ... -- a few thousand names
+appear over and over across 260 k dicts.
+
+### The byte-cache
+
+Add a second cache layer in front of `PDFName.of`, keyed by the
+byte content of the name body (not the constructed string).
+
+```js
+// Scan body + Java-style hash in one pass. Smi math, no allocs.
+let hash = 0;
+while (idx < len) {
+  const byte = buf[idx];
+  if (IsWhitespace[byte] || IsDelimiter[byte]) break;
+  hash = (hash * 31 + byte) | 0;   // Smi
+  idx++;
+}
+
+// Map<hash, Entry | Entry[]>. Single-entry buckets are the common
+// case (4.8 k names into 2^32 hash space -> ~0 collisions).
+const bucket = byteCache.get(hash);
+if (bucket instanceof Entry) {
+  if (_bytesEqual(bucket.bytes, buf, start, idx)) return bucket.name;
+}
+// ... collision-bucket scan, then miss path.
+```
+
+On hit -- 99.7 % of calls -- return the cached `PDFName` with
+**zero string allocation, zero `Map<string, ...>` hashing**. Just
+a hash compute, a `Map<number, ...>` lookup, a `Uint8Array`
+equality check.
+
+On miss, build the string in one shot (`String.fromCharCode`
+with direct args via the existing fast path -- not `.apply` on a
+typed-array view), route through `PDFName.of` (which on this
+stack is fast-decode-name's string-keyed Map), and cache the
+returned `PDFName` in the byte-cache for next time. Both caches
+converge on the same `PDFName` instance per logical name, so
+identity comparisons (`name === PDFName.Type`) keep working
+everywhere downstream.
+
+### Bucket shape: `Entry | Entry[]`
+
+The Map values are polymorphic on purpose. Single-entry buckets
+store the `Entry` directly; on collision we promote to an
+`Entry[]` for linear scan. For 4.8 k unique names hashed into
+2^32 space, the expected collision count is ~0 (birthday bound).
+The polymorphic check (`bucket instanceof Entry`) only fires once
+per lookup; no IC degradation was observed in practice.
+
+Stable hidden class for `Entry`: a plain class with
+`{ bytes, name }` set in the constructor body. Same pattern as
+`fast-refs-class` / `fast-dict-onebuf`'s `_FastDict` etc -- avoid
+`Object.create + writes`, give V8 a fixed shape from the start.
+
+### Hash function: Java-style `hash * 31 + byte`
+
+```js
+hash = (hash * 31 + byte) | 0;
+```
+
+- The `| 0` keeps `hash` in signed 32-bit (int32) range, which V8
+  keeps unboxed inside the loop (no per-byte `HeapNumber` allocation).
+- `* 31` compiles to `(x << 5) - x` which is cheap.
+- Length is implicit in the iteration count (different-length
+  names with the same byte sums hash differently).
+- Collisions for the 4.8 k unique book names are zero in practice;
+  even if they occurred, the bucket scan catches them.
+
+FNV-1a was considered but adds two more shift-add ops per byte
+without measurable improvement for this collision count.
+
+### Measured
+
+Paired heap + cpu profile, baseline = the array-class state from
+the constructor-shape round, this on top:
+
+| Frame                          | Pre (ms) | Post (ms) | Delta              |
+|--------------------------------|---------:|----------:|-------------------:|
+| `PDFObjectParser.parseName`    |    87.14 |  (gone)   | **-87+ ms** (out of top 15) |
+| `fastOf` (PDFName decode-name) |    52.76 |  (gone)   | **-52+ ms** (out of top 15) |
+| `fastParseName` (new row)      |       -- |    58.52  | +58 ms (the cache lookup itself) |
+| `(garbage collector)`          |    58.69 |    80.31  | +21 ms (live-cache mark cost) |
+| Combined parseName + fastOf    |    ~144  |    ~58    | **-86 ms (-60 %)** |
+
+| Phase / metric                | Pre      | Post     | Delta               |
+|-------------------------------|---------:|---------:|--------------------:|
+| Process wall-clock (cpu run)  |  0.90 s  |  0.82 s  | **-80 ms (-9 %)**   |
+|   load                        |  0.41 s  |  0.33 s  | -80 ms              |
+|   save                        |  0.42 s  |  0.42 s  | flat                |
+| Heap (sampled total)          | 33.68 MB | 34.98 MB | +1.30 MB (cache)    |
+|   new `fastParseName` row     |        0 | 1 269 KB | the cache itself    |
+|   `set` (builtin)             |   624 KB |   852 KB | +228 KB (Map.set)   |
+
+The CPU win is all on load (which is where `parseName` runs);
+save is unchanged. Heap is +1.3 MB long-lived (the cache + 4.8 k
+`Uint8Array` byte-keys + Entry objects + `Map<number, ...>`
+overhead), a fixed cost for a workload-bounded cache.
+
+### A note on the heap-profile wall-clock
+
+Under `--heap-profile-process --heap-sampling 512`, the same run
+shows a much bigger speedup than the cpu-profile run:
+
+|                                  | Pre (heap-prof) | Post (heap-prof) | Delta    |
+|----------------------------------|----------------:|-----------------:|---------:|
+| Process wall-clock (heap run)    |          3.50 s |           2.56 s | -940 ms  |
+
+That 940 ms is **not a real wall-clock win** -- it's the
+sampler's per-allocation bookkeeping overhead dropping in step
+with the ~1.6 M transient allocations we just eliminated. The
+sampler fires once every 512 B; even at 64 B per allocation
+that's ~12 % of allocations sampled, but the bookkeeping work
+runs on **every** allocation to decide whether to sample.
+
+Read the cpu-profile number (-80 ms) for "did we get faster";
+read the heap-row delta (+1.3 MB) for "what's the long-lived
+cost". The 940 ms drop under heap profile is a secondary signal
+that confirms the allocation count dropped a lot even though most
+of those allocations were under the 512 B sample threshold and
+don't appear in the heap table.
+
+### Caveats
+
+- **Cache is process-lifetime.** Same as fast-decode-name. No
+  eviction; on the book the long-lived size stabilises at
+  ~1.3 MB (4.8 k entries × ~270 B amortised). For a workload
+  with very many unique names this would grow; for PDFs it
+  doesn't.
+- **`Map<number, value>` for the hash bucket.** V8's Map handles
+  Smi keys well, but allocates Map entry objects on `.set`.
+  The +228 KB in the `set` builtin row is mostly that.
+- **The byte-cache and fast-decode-name's string-cache are not
+  the same Map.** Both cache `PDFName` lookups, keyed
+  differently. Direct `PDFName.of("Foo")` calls (from non-parser
+  code) skip the byte-cache and hit fast-decode-name directly;
+  subsequent parser hits on the same name use the byte-cache.
+  Both return the same `PDFName` instance because the miss path
+  of the byte-cache goes through `PDFName.of`, which is
+  fast-decode-name.
+
+### What this teaches
+
+The two failed first attempts share a lesson: **V8 is good at
+the things you'd naively want to avoid** (cons-string `+=`,
+method dispatch through a small wrapper class). The wins come
+from eliminating the actual repeated work, not from rewriting
+the loop body.
+
+`parseName` looked like a hot loop. It was actually a hot lookup
+that built the lookup key by hand on every call. Move the
+key-build out of the hot path -- by caching the answer keyed on
+the raw input -- and the loop becomes irrelevant. Same lesson as
+the constructor-shape round (`Object.create + writes` is slow
+because V8 takes a different IC path, not because the writes
+themselves are slow); same lesson likely lurking in other "hot
+loop" rows in the profile.
+
+### Shipped
+
+[`docs/render-book.mjs`](../../docs/render-book.mjs) adds
+`import './lib/fast-parse-name.mjs';` next to the other parser-
+shim imports. The shim is idempotent on import and global on
+install -- there is no opt-out in production; the
+`--fast-parse-name` flag exists for A/B harness work.
+
+## Pipeline the deflate: overlap buffer-build with libuv
+
+After `fast-parse-name` shipped, the CPU profile of the process
+phase showed `PDFObjectStream.getUnencodedContents` at #4 (45.97
+ms self, 123.75 ms with callees) and a fat `(idle)` row at
+31.82 ms / 3.4 %. The two are joined at the hip:
+`parallel-deflate.mjs`'s phase 2 ran the build and the deflate
+in two strictly serial passes:
+
+```js
+// before
+const unencoded = objectStreams.map(os => os.getUnencodedContents());        // ~120 ms main-thread block
+const deflated = await Promise.all(unencoded.map(buf => deflateAsync(buf))); // ~30 ms main-thread idle
+```
+
+Pass 1 built all 453 buffers on the main thread (the
+`getUnencodedContents` total subtree). Pass 2 fired all 453
+deflates into libuv at once and awaited them as a batch. The
+`(idle)` row was the main thread sleeping during that await.
+
+### The fix: fold the two `.map`s
+
+```js
+// after
+const deflated = await Promise.all(
+  objectStreams.map(os => deflateAsync(os.getUnencodedContents())),
+);
+```
+
+`.map` still iterates 453 times sequentially on the main thread,
+but each iteration now does build + dispatch in one step.
+`deflateAsync(buf)` returns a Promise immediately and the libuv
+worker picks up the buffer while the main thread starts building
+the next one. By the time the build loop finishes at ~120 ms,
+the first ~430 deflates have already run on the 4-worker pool
+(each takes ~0.3 ms compute); only the last handful are still in
+flight. The `await Promise.all` resolves almost immediately.
+
+### Why the savings are bounded
+
+The build loop is ~120 ms of main-thread JS; the total deflate
+compute is ~130 ms across 4 libuv threads, i.e. ~33 ms of wall.
+Pipelining overlaps the 33 ms of deflate-wall with the 120 ms of
+build-wall. Max possible win: the 33 ms idle. Build itself stays
+single-threaded -- pipelining can't shrink that.
+
+A bigger win would require putting the build itself on workers,
+but `getUnencodedContents` dispatches `.copyBytesInto()` on
+PDFDict / PDFArray / PDFNumber / PDFName / PDFRef / PDFString /
+PDFStream wrappers, and JS object wrappers can't cross
+`worker_threads` boundaries (the byte ranges live in `mainBuf` /
+`arrayMain`, but the dispatch logic is in pdf-lib + our shims).
+Either we duplicate ~500 lines of byte-emission into a worker
+file with SharedArrayBuffer views of the buffers, or we rewrite
+it as a native addon. Neither pays for itself at this row size.
+
+### Measured wins
+
+A/B on the book, 3 runs each, same shipped flag set, paired
+`--cpu-profile-process` (Windows /affinity-pinned):
+
+| Run                | process | load   | setOutline | save   |
+|--------------------|--------:|-------:|-----------:|-------:|
+| baseline A         |  0.89 s | 0.34 s |    0.01 s  | 0.48 s |
+| baseline B         |  0.90 s | 0.35 s |    0.01 s  | 0.47 s |
+| baseline C         |  0.87 s | 0.35 s |    0.01 s  | 0.45 s |
+| **baseline avg**   | **0.887 s** | 0.347 s | 0.010 s | **0.467 s** |
+| pipelined A        |  0.84 s | 0.34 s |    0.01 s  | 0.43 s |
+| pipelined B        |  0.83 s | 0.34 s |    0.01 s  | 0.42 s |
+| pipelined C        |  0.83 s | 0.34 s |    0.01 s  | 0.41 s |
+| **pipelined avg**  | **0.833 s** | 0.340 s | 0.010 s | **0.420 s** |
+| **delta**          | **-54 ms (-6.1 %)** | flat | flat | **-47 ms (-10.1 %)** |
+
+Load is flat (as expected -- no change touched it). Save dropped
+47 ms consistently across all 3 runs. The smoking gun in the CPU
+profile: baseline's `(idle)` row sat at 21 ms / 2.8 % (rank #9 of
+top 15); after pipelining the row drops out of the top 15
+entirely. That's the deflate-await idle being absorbed into the
+build wall, exactly as predicted.
+
+`getUnencodedContents` self-time also dropped (31.56 → 22.25 ms
+in the paired profiles), probably because in the baseline its
+samples were sandwiched between a sync build and a sync await
+with no other work to attribute against; in the pipelined version
+V8 task scheduling between the build and the fire-and-forget
+Promise creation absorbs some of that attribution. Either way the
+row stays in the top 15 -- the build itself is unchanged.
+
+### Why estimate (~32 ms) and actual (~47 ms) differ
+
+The estimate was derived from the `(idle)` row alone. The actual
+save delta is larger because the await also paid for:
+
+- Microtask-queue drain at the `Promise.all` gate (a few ms
+  across 453 settled promises).
+- libuv callback marshalling for the batch (the 24.75 ms
+  `writeSync` row in the baseline -- the inspector's name for
+  the deflate-result callback, not `fs.writeFileSync`). In the
+  pipelined version those callbacks fire spread out during the
+  build loop instead of bunched at the end.
+
+Both are small but real. Together they explain the gap between
+the 32 ms idle estimate and the 47 ms save delta.
+
+### What this teaches
+
+Two serial `.map`s ahead of a single `await` are almost always
+a missed pipeline. The fix is mechanical (fold into one `.map`),
+but the win only shows up when the second stage runs on a
+different execution context -- here, libuv's thread pool. For two
+main-thread stages there'd be no overlap to harvest and the diff
+would be a wash.
+
+The `(idle)` row in a CPU profile is the cheapest "next win" to
+spot: any time it's >2 %, the main thread is parked at an `await`
+whose awaited work hasn't finished yet. Worth grepping for.
+
+### Shipped
+
+In-place edit to
+[`docs/lib/parallel-deflate.mjs`](../../docs/lib/parallel-deflate.mjs)
+at the `parallelSave` path; the harness's `--parallel-deflate`
+flag continues to flip the whole `parallelSave` path on, no new
+knob. `render-book.mjs` already calls `parallelSave` so the
+change is live in production without a new import.
+
+## Pack `PDFPageLeaf` flags into `d`'s gap bits
+
+`fast-dict-onebuf`'s class-shape change (above) left PDFPageLeaf as
+the only subclass with extra fields: `normalized` (default false) +
+`autoNormalizeCTM` (default true), both written in the
+`_FastPageLeaf` constructor body. V8 sees a fixed shape per
+subclass but PageLeaf instances are ~16 B larger than plain
+`_FastDict` (the two boolean slots plus their map entries). 1 651
+PageLeaves on the book × ~16 B = ~26 KB -- sub-row at the 512 B
+sampler resolution, but the same tax was waiting on every other
+`Object.create + writes`-style wrapper we'd want to apply this
+shape to.
+
+The packed `d` had 15 unused bits between the 14-bit length field
+and the 53-bit `Number.MAX_SAFE_INTEGER` ceiling. Two booleans fit
+in two bits.
+
+### The new layout
+
+```
+bits  0-22: start  (23 bits, max 8.4 M slots; mainLen ~2.3 M today)
+bit     23: PDFPageLeaf `normalized` flag (zero on all other subtypes)
+bit     24: PDFPageLeaf `autoNormalizeCTM` flag (zero on all other subtypes)
+bits 25-40: length (16 bits, max 65 535 slots; observed max 8 706)
+bits 41-52: spare (12 bits; available headroom)
+```
+
+`start` drops 24 → 23 bits (8.4 M slots, well above the ~2.3 M
+mainLen seen today); `length` grows 14 → 16 bits (65 535,
+comfortable headroom over the observed max).
+
+PDFPageLeaf collapses to the same single-`d` shape as plain
+PDFDict; `normalized` and `autoNormalizeCTM` become prototype
+getters/setters that mask in/out of bits 23-24.
+
+### The V8 Smi gotcha (worth recording)
+
+V8's Smi (31-bit signed integer) range covers values up to 2^30.
+`start + length * 2^25` stays Smi iff `length < 32`; beyond that
+`d` boxes to a HeapNumber. Two consequences:
+
+1. **Reads stay correct.** `d & MASK_23` lives in the low 32 bits;
+   V8 coerces `d` to Int32 for the `&`, which reads bits 0..30
+   correctly even when `d` is HeapNumber-boxed.
+   `Math.floor(d / POW_25) & MASK_16` operates on the full Number
+   range before the mask truncates.
+2. **Writes must use arithmetic, not bitwise OR.** `d | NORM_BIT`
+   on a HeapNumber'd d truncates to Int32 and loses the high bits
+   (the length). Use `d + NORM_BIT` / `d - NORM_BIT` gated on the
+   current bit state -- arithmetic addition operates on the full
+   Number range.
+
+The setters all follow the pattern:
+
+```js
+set(v) {
+  const d = this.d;
+  const has = (d & NORM_BIT) !== 0;
+  if (v && !has)      this.d = d + NORM_BIT;
+  else if (!v && has) this.d = d - NORM_BIT;
+}
+```
+
+`_cow`, `set` (the COW-on-mutation path), and `delete` all preserve
+the gap bits when they repack `d` by adding `(d & GAP_MASK)` back
+into the freshly packed value. For non-PageLeaf dicts the mask is
+zero so the add is a no-op; for PageLeaf the flags survive any
+backing-buffer move.
+
+### Constructor default
+
+`_FastPageLeaf` defaults `autoNormalizeCTM` to `true` (upstream
+behavior). Since `pack(start, length)` produces a value with bits
+23-24 cleared, the constructor sets bit 24 via addition:
+
+```js
+function _FastPageLeaf(d) { this.d = d + AUTO_BIT; }
+```
+
+Addition not `|` for the same reason -- once `length >= 64`, `d`
+reaches 2^31 and its high (length) bits would be corrupted by the
+Int32 coercion that `|` performs.
+
+### Measured
+
+Heap saving on the 1 651 page leaves is sub-row at the 512 B
+sampler resolution but real (~26 KB by per-instance arithmetic).
+Output PDF byte-identical to baseline. CPU flat -- no PDFPageLeaf
+mutation paths fire on the book's render-only workflow.
+
+The change is local to
+[`docs/lib/fast-dict-onebuf.mjs`](../../docs/lib/fast-dict-onebuf.mjs);
+no production import or flag change needed since
+`--fast-dict-onebuf` was already wired up.
+
+## Two-shape `PDFRef`: gen=0 single-slot
+
+`fast-refs-class`'s single-shape constructor still allocates two
+inline slots per PDFRef -- `objectNumber` and `generationNumber`.
+On fresh-Chrome workloads `generationNumber` is **always zero**
+except for the xref "free" entry at object 0; the slot is dead
+weight on 226 k of 226 k instances.
+
+### The shim
+
+Split `_FastRef` into two constructors keyed on whether
+`generationNumber` is needed:
+
+```js
+// gen=0 instances: single inline `objectNumber` slot.
+// `generationNumber` is supplied as a data-property default on
+// PDFRef.prototype (set below), so reads return 0 without any
+// accessor dispatch.
+function _FastRef(objectNumber) {
+  this.objectNumber = objectNumber;
+}
+_FastRef.prototype = PDFRef.prototype;
+
+// gen!=0 instances: both fields as own data properties, shadowing
+// the prototype default. V8 sees a second hidden class -- bounded
+// 2-shape polymorphism, well-handled by inline caches.
+function _FastRefGen(objectNumber, generationNumber) {
+  this.objectNumber = objectNumber;
+  this.generationNumber = generationNumber;
+}
+_FastRefGen.prototype = PDFRef.prototype;
+
+// Default generationNumber on the prototype. _FastRef instances
+// inherit this (no own property); _FastRefGen instances shadow it
+// with their own data property. Both look like data-property reads
+// to V8's IC.
+PDFRef.prototype.generationNumber = 0;
+```
+
+The critical design point: **prototype default, not accessor**. A
+data-property default on the prototype keeps `.generationNumber`
+reads on the hot path as plain data-property loads. V8's monomorphic
+IC for `.generationNumber` covers both shapes uniformly -- the
+property is "present and readable as data" at the same offset in
+the IC's mental model, whether it lives on the instance or one hop
+up the prototype chain.
+
+### The accessor variant that didn't work
+
+A first attempt packed `(objectNumber, generationNumber)` into a
+single `d` field with `objectNumber` / `generationNumber` as
+getter accessors on the prototype:
+
+```js
+// rejected
+function _FastRefPacked(d) { this.d = d; }
+Object.defineProperty(PDFRef.prototype, 'objectNumber', {
+  get() { return this.d & MASK_obj; },
+});
+Object.defineProperty(PDFRef.prototype, 'generationNumber', {
+  get() { return this.d >>> SHIFT_gen; },
+});
+```
+
+Result: **+1.6 MB heap and +70 ms CPU** vs the two-shape variant.
+The accessor-property boundary broke V8's monomorphic ICs at every
+upstream pdf-lib call site that reads `ref.objectNumber` /
+`ref.generationNumber` -- `PDFCrossRefSection.append`,
+`PDFCrossRefStream` entry tuples, `PDFWriter.serializeToBuffer`,
+our `fast-indirect-objects` shim, plus all the small `{ref, offset,
+deleted}` literals in `addEntry`. V8 couldn't elide those object
+literals as aggressively once the property read became an accessor
+dispatch; recompilation paths landed with worse code than the
+two-slot baseline. **Same property name, same return value, but a
+different IC slot type -- and the difference shows up at every
+caller.**
+
+The two-shape variant pays for the win in a bounded place (one
+extra hidden class for the rare gen!=0 path) without touching any
+caller's IC.
+
+### Pool changes
+
+gen=0: same dense array indexed by `objectNumber` (unchanged).
+
+gen!=0: instead of falling back to the upstream `PDFRef.of`'s
+Map-based pool, the shim now keeps its own `poolGenN` Map keyed by
+`"N M"`. This means we never call into upstream PDFRef.of at all
+-- the entire `PDFRef.of` factory is ours. Path is dead on
+fresh-Chrome workloads except for the xref free entry at object 0,
+so the Map stays tiny.
+
+### Measured heap
+
+Paired heap profile (single-shape baseline vs + this change):
+
+| Allocator       | Pre        | Post       | Delta            |
+|-----------------|-----------:|-----------:|-----------------:|
+| Total sampled   |   34.96 MB |   33.08 MB | **-1.88 MB**     |
+
+Per-instance arithmetic: V8 aligns object headers + inline slots
+to 8-byte boundaries. Two-slot `_FastRefGen`-shape: 8 B header +
+2×4 B slots = 16 B raw, aligned to **24 B**. One-slot `_FastRef`
+shape: 8 B header + 1×4 B slot = 12 B raw, aligned to **16 B**.
+8 B saved per gen=0 instance × 226 k unique = ~1.8 MB. Matches
+the measured delta.
+
+### Measured CPU
+
+CPU is essentially flat -- no-profile wall-clock 0.70 s vs ~0.83 s
+pre, but the variance overlaps and the heap-saving lane isn't the
+source of CPU movement. PDF output byte-identical.
+
+### Validation
+
+Output PDF byte-identical -- same `PDFRef` identity per logical
+ref, same prototype methods, all callers see what they always did
+(`ref.objectNumber`, `ref.generationNumber`, `ref.toString()`,
+`ref.copyBytesInto(...)` all work). The change is local to
+[`docs/lib/fast-refs-class.mjs`](../../docs/lib/fast-refs-class.mjs);
+no production import or flag change needed since
+`--fast-refs-class` was already wired up.
+
+### What this teaches
+
+The shape change is interior to PDFRef construction; the IC story
+is at every caller. A two-shape polymorphism on one class is cheap
+when V8 sees it; an accessor-property change on a hot read is
+expensive everywhere that read happens. Prefer adding shape
+variants over swapping data properties for accessors when the
+property is hot.
+
+## `@cantoo/pdf-lib`: not a drop-in replacement
+
+Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
+alternative to Hopding's abandoned `pdf-lib` 1.17.1. Source-diff:
+the four hot paths our shims address (`PDFRef.of`'s string-keyed
+pool, `decodeName`'s unconditional regex, `parseRawInt` /
+`parseRawNumber`'s per-byte string concat,
+`PDFFlateStream.computeContents`'s synchronous pako call) are
+byte-identical to upstream. Paired A/B on the book confirmed:
+cantoo without shims runs the process phase in ~150 s vs our ~1.5 s
+with shims, and has its own footguns (silent compression-disable
+on PDF < 1.5, separate save-path pathology with `useObjectStreams:
+true` that wasn't chased). Not a drop-in replacement; staying on
+Hopding + shims.
+
+## Where this leaves the picture
+
+Cumulative process-phase cost, baseline → after the shims to date:
+
+| state                                | process | load | save |
+| ---                                  | ---     | ---  | ---  |
+| original (Slow / 50 defaults)        | ~40 s   | ~36 s| ~4 s |
+| + parseSpeed:Fastest                 | ~5 s    | ~2 s | ~3 s |
+| + fast-deflate                       | ~2.5 s  | ~1.5s| ~1 s |
+| + fast-refs                          | ~2.3 s  | ~1.3 s | ~1 s |
+| + parallel-deflate                   | ~2.0 s  | ~1.3 s | ~0.7 s |
+| + fast-decode-name + fast-number-to-string | ~1.6 s  | ~1.0 s | ~0.6 s |
+| + fast-size-in-bytes                 | ~1.5 s  | ~1.0 s | ~0.5 s |
+| + fast-dict-iter                     | ~1.4 s  | ~1.0 s | ~0.4 s |
+| + fast-parse-dict                    | ~1.4 s  | ~1.0 s | ~0.4 s |
+| + fast-parse-object                  | ~1.4 s  | ~1.0 s | ~0.4 s |
+| + fast-sync-load                     | ~1.3 s  | ~0.8 s | ~0.5 s |
+| + fast-dict-array                    | ~1.1 s  | ~0.7 s | ~0.4 s |
+| + fast-indirect-objects              | ~1.1 s  | ~0.7 s | ~0.4 s |
+| + fast-refs miss bypass              | ~1.0 s  | ~0.6 s | ~0.4 s |
+| + fast-pdfnumber-pool                | ~1.0 s  | ~0.6 s | ~0.4 s |
+| + parseDict pre-sized array          | ~1.0 s  | ~0.6 s | ~0.4 s |
+| + fast-dict-onebuf                   | ~1.0 s  | ~0.6 s | ~0.4 s |
+| + measure-pass Phase 1               | ~1.0 s  | ~0.7 s | ~0.4 s |
+| + fast-array-onebuf                  | ~1.0 s  | ~0.7 s | ~0.4 s |
+| + fast-refs tag drop                 | ~1.0 s  | ~0.7 s | ~0.4 s |
+| + skipJibberish digit fast-path      | ~0.95 s | ~0.6 s | ~0.4 s |
+| + fast-refs-class                    | ~0.9 s  | ~0.55 s | ~0.4 s |
+| + fast-dict-onebuf class shape       | ~0.9 s  | ~0.55 s | ~0.4 s |
+| + fast-array-onebuf class shape      | ~0.8 s  | ~0.5 s  | ~0.35 s |
+| + fast-parse-name                    | ~0.75 s | ~0.4 s  | ~0.35 s |
+| + pipeline-deflate                   | ~0.7 s  | ~0.4 s  | ~0.3 s  |
+| + PageLeaf flag-packing              | ~0.7 s  | ~0.4 s  | ~0.3 s  |
+| **+ two-shape PDFRef (this section)** | **~0.7 s** | **~0.4 s** | **~0.3 s** |
+
+The bottom-up after the latest pair is what's left of pdf-lib's
+genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,
+`PDFObjectParser.parseDict`, GC, with no remaining JS-body row
+sitting on "regex scanning for something that's never there" or
+"redundant `toString` round-trip" shape. The `fastOf` row at
+~91 ms is a real floor for any cache-in-front approach: the
+`indexOf` + `Map.get` cost ~33 ns per call across 2.76 M calls.
+
+The pdf-lib roundtrip path is now ~1.6 s on production
+(profiler-off; the harness reports ~2.0-2.2 s with profiler-on
+attribution overhead). The incremental writer's 0.25 s process
+phase (see [01-baseline-and-detach.md](01-baseline-and-detach.md))
+is still strictly faster on process alone, but the pdf-lib path
+delivers a 15.3 MB output vs incremental's 53 MB, and the ~1.4 s
+gap on a 50 s build doesn't justify the file-size cost for our
+pipeline.
+
+The strategic note from earlier phases still stands: generate's
+~38 s in `page.pdf()` is the remaining lever, and `pageRanges`
+sharding is the only knob plausibly large enough to move the
+wall-clock total by more than a few seconds.
diff --git a/perf/phase0-measure.mjs b/perf/phase0-measure.mjs
new file mode 100644
index 0000000..4013945
--- /dev/null
+++ b/perf/phase0-measure.mjs
@@ -0,0 +1,702 @@
+// Phase 0 prototype: no-allocate measure pass over a PDF byte stream.
+//
+// Walks the PDF grammar as a state machine without instantiating any
+// PDFObject. Counts what would need allocating: indirect objects,
+// dicts and their slot counts, arrays and their slot counts, refs,
+// names, numbers, strings, streams (incl. ObjStms with inflate +
+// inner-object walk), max recursion depth.
+//
+// Then runs PDFDocument.load on the same bytes (with the production
+// shim set imported), so we can compare CPU cost head-to-head.
+//
+// This is a viability gate: if measure-pass is <<load (e.g. <300 ms
+// vs load's 1-2 s), the two-pass measure-then-allocate architecture
+// is worth committing to. If it's not, we revisit.
+//
+// Usage:
+//   node perf/phase0-measure.mjs [path/to/pdf] [--runs N] [--no-load]
+//
+// Defaults: --runs 3, input = most recent perf/results/*/book.pdf.
+
+import { readFileSync, readdirSync, statSync } from 'node:fs';
+import { join, resolve } from 'node:path';
+import { inflateSync } from 'node:zlib';
+import { performance } from 'node:perf_hooks';
+import { createRequire } from 'node:module';
+
+// Production-equivalent shim wiring (same order as docs/render-book.mjs).
+await import('../docs/lib/fast-refs.mjs');
+await import('../docs/lib/fast-inflate.mjs');
+await import('../docs/lib/fast-parse-number.mjs');
+await import('../docs/lib/fast-decode-name.mjs');
+await import('../docs/lib/fast-number-to-string.mjs');
+await import('../docs/lib/fast-size-in-bytes.mjs');
+await import('../docs/lib/fast-dict-onebuf.mjs');
+await import('../docs/lib/fast-parse-object.mjs');
+await import('../docs/lib/fast-sync-load.mjs');
+await import('../docs/lib/fast-indirect-objects.mjs');
+await import('../docs/lib/fast-pdfnumber-pool.mjs');
+
+const require = createRequire(import.meta.url);
+const { PDFDocument } = require('pdf-lib');
+
+// ---- Byte constants -------------------------------------------------
+
+const TAB = 9, LF = 10, FF = 12, CR = 13, SP = 32; // ASCII whitespace bytes
+const LT = 60 /* < */, GT = 62 /* > */;
+const LB = 91 /* [ */, RB = 93 /* ] */;
+const LP = 40 /* ( */, RP = 41 /* ) */;
+const SLASH = 47, PERCENT = 37, BACKSLASH = 92; // '/', '%', '\'
+const D0 = 48, D9 = 57; // '0' and '9', the ends of the ASCII digit range
+const MINUS = 45, PLUS = 43, DOT = 46; // '-', '+', '.'
+const a_ = 97, b_ = 98, d_ = 100, e_ = 101, f_ = 102, j_ = 106; // lowercase ASCII letters, named <letter>_
+const l_ = 108, m_ = 109, n_ = 110, o_ = 111, r_ = 114, s_ = 115;
+const t_ = 116, u_ = 117, x_ = 120;
+const R_CH = 82, L_CH = 76, T_CH = 84, N_CH = 78, F_CH = 70; // uppercase 'R', 'L', 'T', 'N', 'F'
+
+// ---- Lookup tables (mirror pdf-lib's IsWhitespace / IsDelimiter / IsDigit / IsNumeric) ----
+
+const IsWS = new Uint8Array(256);
+IsWS[0] = IsWS[TAB] = IsWS[LF] = IsWS[FF] = IsWS[CR] = IsWS[SP] = 1;
+
+const IsDelim = new Uint8Array(256);
+IsDelim[LT] = IsDelim[GT] = IsDelim[LB] = IsDelim[RB] = 1;
+IsDelim[LP] = IsDelim[RP] = IsDelim[SLASH] = IsDelim[PERCENT] = 1;
+
+const IsDigit = new Uint8Array(256);
+for (let b = D0; b <= D9; b++) IsDigit[b] = 1;
+
+const IsNumeric = new Uint8Array(IsDigit);
+IsNumeric[DOT] = IsNumeric[MINUS] = IsNumeric[PLUS] = 1;
+
+// ---- Measurer -------------------------------------------------------
+
+class Measurer {
+  constructor(buf) {
+    this.buf = buf; // bytes being scanned; processObjStm temporarily swaps in inflated data
+    this.pos = 0; // read cursor into this.buf
+    this._len = buf.length; // cached length, swapped together with this.buf
+
+    // Counters: everything the walk tallies; main() prints them after the last run.
+    this.numIndirectObjects = 0;
+    this.numDicts = 0;
+    this.numDictSlots = 0; // parseDict adds two slots (key + value) per entry
+    this.numArrays = 0;
+    this.numArraySlots = 0;
+    this.numRefs = 0;
+    this.numNames = 0;
+    this.numNumbers = 0;
+    this.numStrings = 0; // literal (...) strings only
+    this.numHexStrings = 0;
+    this.numStreams = 0;
+    this.numObjStms = 0;
+    this.numObjStmInnerObjects = 0;
+    this.maxDictSlots = 0;
+    this.maxArraySlots = 0;
+    this.maxRecursionDepth = 0;
+    this.totalStreamBytes = 0;
+    this.totalInflatedBytes = 0;
+
+    // Dict-frame stack: parseDict pushes a frame and leaves it for
+    // its caller to read (then pop). We track /Length, /Type=/ObjStm,
+    // /N, /First per frame for stream/ObjStm handling.
+    const MAX_DEPTH = 64; // parseDict enforces this bound with its own literal 64
+    this._depth = 0; // nesting level shared by dicts and arrays; indexes the frame arrays
+    this._stLength  = new Int32Array(MAX_DEPTH); // parseDict resets a frame's entries to -1 / 0 on entry
+    this._stIsObjStm = new Uint8Array(MAX_DEPTH);
+    this._stN      = new Int32Array(MAX_DEPTH);
+    this._stFirst  = new Int32Array(MAX_DEPTH);
+
+    // Reusable ObjStm offset arrays (grown on demand)
+    this._objNums    = new Int32Array(512);
+    this._objOffsets = new Int32Array(512);
+  }
+
+  // ---- Skip helpers (no allocation) --------------------------------
+
+  skipWS() {
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len) {
+      const b = buf[p];
+      if (IsWS[b]) { p++; continue; }
+      if (b === PERCENT) {
+        while (p < len && buf[p] !== LF && buf[p] !== CR) p++;
+        continue;
+      }
+      break;
+    }
+    this.pos = p;
+  }
+
+  // Parse an integer in place. No string concat. Returns NaN if no digit.
+  // Does NOT bump numNumbers (used for metadata: header, ObjStm offsets).
+  _skipInt() {
+    const buf = this.buf, len = this._len;
+    let p = this.pos, v = 0, sign = 1, any = 0;
+    if (buf[p] === MINUS) { sign = -1; p++; }
+    else if (buf[p] === PLUS) { p++; }
+    while (p < len) {
+      const b = buf[p];
+      if (b < D0 || b > D9) break;
+      v = v * 10 + (b - D0);
+      any = 1; p++;
+    }
+    this.pos = p;
+    return any ? sign * v : NaN;
+  }
+
+  // Skip a name (already past '/'); just consume body bytes
+  _skipNameBody() {
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len) {
+      const b = buf[p];
+      if (IsWS[b] || IsDelim[b]) break;
+      p++;
+    }
+    this.pos = p;
+  }
+
+  // Skip a /name token, bumping numNames
+  skipName() {
+    this.pos++; // skip /
+    this._skipNameBody();
+    this.numNames++;
+  }
+
+  // Skip a literal (...) string, handling escapes
+  skipString() {
+    this.pos++; // skip (
+    const buf = this.buf, len = this._len;
+    let p = this.pos, depth = 1;
+    while (p < len && depth > 0) {
+      const b = buf[p];
+      if (b === BACKSLASH) { p += 2; continue; }
+      if (b === LP) depth++;
+      else if (b === RP) depth--;
+      p++;
+    }
+    this.pos = p;
+    this.numStrings++;
+  }
+
+  // Skip a <hex> string
+  skipHexString() {
+    this.pos++; // skip <
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len && buf[p] !== GT) p++;
+    p++; // skip >
+    this.pos = p;
+    this.numHexStrings++;
+  }
+
+  // ---- Name disambiguation (no allocation) ------------------------
+
+  // Skip /name and tag whether it matched a known stream-related key.
+  // Returns: 0=other, 1=Length, 2=Type, 3=N, 4=First
+  matchDictKey() {
+    const buf = this.buf, len = this._len;
+    this.pos++; // skip the leading '/'
+    const start = this.pos; // first byte of the name body
+    let match = 0; // stays 0 unless one of the four tracked keys matches exactly
+
+    const b0 = buf[start]; // dispatch on the first letter, then compare the rest byte by byte
+    if (b0 === L_CH /* L */) {
+      if (start + 6 <= len &&
+          buf[start+1] === e_ && buf[start+2] === n_ &&
+          buf[start+3] === 103 /* g */ && buf[start+4] === t_ &&
+          buf[start+5] === 104 /* h */ &&
+          (start+6 === len || IsWS[buf[start+6]] || IsDelim[buf[start+6]])) { // name must end here
+        match = 1; // "Length"
+        this.pos = start + 6;
+      }
+    } else if (b0 === T_CH /* T */) {
+      if (start + 4 <= len &&
+          buf[start+1] === 121 /* y */ && buf[start+2] === 112 /* p */ &&
+          buf[start+3] === e_ &&
+          (start+4 === len || IsWS[buf[start+4]] || IsDelim[buf[start+4]])) { // name must end here
+        match = 2; // "Type"
+        this.pos = start + 4;
+      }
+    } else if (b0 === N_CH /* N */) {
+      if (start + 1 === len || IsWS[buf[start+1]] || IsDelim[buf[start+1]]) { // single-letter name
+        match = 3; // "N"
+        this.pos = start + 1;
+      }
+    } else if (b0 === F_CH /* F */) {
+      if (start + 5 <= len &&
+          buf[start+1] === 105 /* i */ && buf[start+2] === r_ &&
+          buf[start+3] === s_ && buf[start+4] === t_ &&
+          (start+5 === len || IsWS[buf[start+5]] || IsDelim[buf[start+5]])) { // name must end here
+        match = 4; // "First"
+        this.pos = start + 5;
+      }
+    }
+
+    if (match === 0) this._skipNameBody(); // pos is still at start, so this consumes the whole name
+    this.numNames++; // the key counts as a name whether or not it matched
+    return match;
+  }
+
+  // After / is already skipped, check if name body equals an ASCII string.
+  // Does NOT move pos. Caller _skipNameBody afterwards.
+  _isNameAt(p, name) {
+    const buf = this.buf, len = this._len;
+    const n = name.length;
+    if (p + n > len) return false;
+    for (let i = 0; i < n; i++) {
+      if (buf[p + i] !== name.charCodeAt(i)) return false;
+    }
+    if (p + n === len) return true;
+    const after = buf[p + n];
+    return !!(IsWS[after] || IsDelim[after]);
+  }
+
+  // ---- Number / Ref ------------------------------------------------
+
+  // Parse a number-or-ref token starting at pos. Bumps numNumbers or
+  // numRefs as appropriate. Returns the integer value if it was a plain
+  // integer (for /Length capture); else NaN.
+  //
+  // PDF grammar: optional sign, optional digits, optional dot, optional
+  // digits. At least one digit required somewhere. No exponentials.
+  // So '.251', '-1.5', '+5', '5.', '5' are all valid.
+  parseNumberOrRefCapture() {
+    const buf = this.buf, len = this._len;
+    let p = this.pos; // local cursor; this.pos is only updated once the token is validated
+    let sign = 1;
+    if (buf[p] === MINUS) { sign = -1; p++; }
+    else if (buf[p] === PLUS) { p++; }
+    let intDigits = 0, intVal = 0; // digit count and value of the integer part
+    while (p < len && buf[p] >= D0 && buf[p] <= D9) {
+      intVal = intVal * 10 + (buf[p] - D0);
+      intDigits++; p++;
+    }
+    let hasDot = 0, fracDigits = 0; // fraction digits are counted, their value is not accumulated
+    if (p < len && buf[p] === DOT) {
+      hasDot = 1; p++;
+      while (p < len && buf[p] >= D0 && buf[p] <= D9) { fracDigits++; p++; }
+    }
+    if (intDigits === 0 && fracDigits === 0) { // a bare sign and/or dot is not a number
+      throw new Error('expected number at ' + this.pos);
+    }
+    this.pos = p; // token consumed; the ref lookahead below may advance further
+    if (hasDot) { // a real is counted as a number and never captured or treated as a ref
+      this.numNumbers++;
+      return NaN;
+    }
+    // Pure integer: lookahead for ref "<sp> <int> <sp> R"
+    const save = this.pos; // rewind point when the lookahead turns out not to be a ref
+    this.skipWS();
+    if (this.pos < len && IsDigit[buf[this.pos]]) {
+      this._skipInt(); // candidate generation number
+      this.skipWS();
+      if (this.pos < len && buf[this.pos] === R_CH) { // the byte after 'R' is not checked
+        this.pos++;
+        this.numRefs++; // the two integers are not added to numNumbers
+        return NaN;
+      }
+    }
+    this.pos = save; // not a ref: leave whatever follows for the next parse call
+    this.numNumbers++;
+    return sign * intVal;
+  }
+
+  // ---- Object dispatch --------------------------------------------
+
+  parseObject() {
+    this.skipWS();
+    const buf = this.buf, len = this._len;
+    if (this.pos >= len) return; // nothing left in the buffer
+    const b = buf[this.pos]; // the first byte decides the object kind
+
+    // Keywords: true / false / null (skipped without bumping any counter)
+    if (b === t_) {
+      if (this.pos + 4 <= len &&
+          buf[this.pos+1] === r_ && buf[this.pos+2] === u_ && buf[this.pos+3] === e_) {
+        this.pos += 4; return;
+      }
+    } else if (b === f_) {
+      if (this.pos + 5 <= len &&
+          buf[this.pos+1] === a_ && buf[this.pos+2] === l_ &&
+          buf[this.pos+3] === s_ && buf[this.pos+4] === e_) {
+        this.pos += 5; return;
+      }
+    } else if (b === n_) {
+      if (this.pos + 4 <= len &&
+          buf[this.pos+1] === u_ && buf[this.pos+2] === l_ && buf[this.pos+3] === l_) {
+        this.pos += 4; return;
+      }
+    }
+
+    if (b === LT) { // '<' starts either a dict ("<<") or a hex string
+      if (buf[this.pos + 1] === LT) {
+        // Dict value: parse, then pop the frame (caller doesn't care)
+        const d = this._depth;
+        this.parseDict();
+        this._depth = d;
+        return;
+      }
+      this.skipHexString();
+      return;
+    }
+    if (b === LP) { this.skipString(); return; }
+    if (b === SLASH) { this.skipName(); return; }
+    if (b === LB) { this.parseArray(); return; }
+    if (IsNumeric[b]) { this.parseNumberOrRefCapture(); return; } // captured value is ignored here
+
+    throw new Error(`parseObject: unexpected byte ${b} ('${String.fromCharCode(b)}') at ${this.pos}`); // also reached by t/f/n that do not spell a keyword
+  }
+
+  // Parse << ... >>. Push frame on stack; do NOT decrement depth.
+  // Caller reads stack frame at index this._depth - 1 and decrements.
+  parseDict() {
+    const d = this._depth++; // d is this dict's frame index
+    if (d >= 64) throw new Error('dict depth overflow at ' + this.pos); // the frame arrays hold 64 entries
+    if (this._depth > this.maxRecursionDepth) this.maxRecursionDepth = this._depth;
+    this._stLength[d]  = -1; // -1 = key absent, or its value was not a plain integer
+    this._stIsObjStm[d] = 0;
+    this._stN[d]      = -1;
+    this._stFirst[d]  = -1;
+
+    this.pos += 2; // skip <<
+    this.skipWS();
+
+    const buf = this.buf, len = this._len;
+    let count = 0; // key/value pairs seen so far
+    while (this.pos < len) {
+      if (buf[this.pos] === GT && buf[this.pos + 1] === GT) break; // closing >>
+      if (buf[this.pos] !== SLASH) throw new Error('expected name at ' + this.pos);
+
+      const tag = this.matchDictKey(); // 1=Length, 2=Type, 3=N, 4=First, 0=any other key
+      this.skipWS();
+
+      if (tag === 1 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stLength[d] = v; // NaN means the value was a real or an indirect ref
+      } else if (tag === 2 && buf[this.pos] === SLASH) {
+        // /Type value -- detect /ObjStm
+        if (this._isNameAt(this.pos + 1, 'ObjStm')) this._stIsObjStm[d] = 1;
+        this.pos++; // skip /
+        this._skipNameBody();
+        this.numNames++; // counts the value name; the key was counted by matchDictKey
+      } else if (tag === 3 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stN[d] = v;
+      } else if (tag === 4 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stFirst[d] = v;
+      } else { // untracked key, or a tracked key whose value is not of the expected kind
+        this.parseObject();
+      }
+      this.skipWS();
+      count++;
+    }
+    this.pos += 2; // skip >> (also runs when the loop ended because the buffer ran out)
+
+    this.numDicts++;
+    this.numDictSlots += count * 2; // one slot for the key, one for the value
+    if (count * 2 > this.maxDictSlots) this.maxDictSlots = count * 2;
+    // Don't decrement _depth here -- caller reads frame then pops.
+  }
+
+  parseArray() {
+    const d = this._depth++;
+    if (this._depth > this.maxRecursionDepth) this.maxRecursionDepth = this._depth;
+
+    this.pos++; // skip [
+    this.skipWS();
+
+    const buf = this.buf, len = this._len;
+    let count = 0;
+    while (this.pos < len && buf[this.pos] !== RB) {
+      this.parseObject();
+      this.skipWS();
+      count++;
+    }
+    this.pos++; // skip ]
+
+    this.numArrays++;
+    this.numArraySlots += count;
+    if (count > this.maxArraySlots) this.maxArraySlots = count;
+    this._depth--;
+  }
+
+  // ---- Indirect object + stream handling --------------------------
+
+  findEndStream(from) {
+    const buf = this.buf, len = this._len;
+    let p = from;
+    while (p + 9 <= len) {
+      if (buf[p] === e_ && buf[p+1] === n_ && buf[p+2] === d_ &&
+          buf[p+3] === s_ && buf[p+4] === t_ && buf[p+5] === r_ &&
+          buf[p+6] === e_ && buf[p+7] === a_ && buf[p+8] === m_) {
+        let end = p;
+        while (end > from && (buf[end-1] === LF || buf[end-1] === CR)) end--;
+        return end;
+      }
+      p++;
+    }
+    throw new Error('endstream not found from ' + from);
+  }
+
+  // Inflate an ObjStm and walk its inner objects.
+  processObjStm(start, end, N, first) {
+    const compressed = this.buf.subarray(start, end); // stream payload [start, end) of the outer buffer
+    let inflated;
+    try {
+      inflated = inflateSync(compressed); // the dict's /Filter is never inspected; non-zlib data lands in the catch
+    } catch (e) {
+      console.warn(`inflate failed at ${start}: ${e.message}`);
+      return; // the caller has already counted this ObjStm; no inner objects are added
+    }
+    this.totalInflatedBytes += inflated.length;
+    this.numObjStmInnerObjects += N; // trusts /N; added before the objects are actually walked
+
+    // Grow offset arrays if needed
+    if (N > this._objOffsets.length) {
+      this._objOffsets = new Int32Array(N);
+      this._objNums = new Int32Array(N);
+    }
+
+    const saveBuf = this.buf, savePos = this.pos, saveLen = this._len; // outer scan state
+    this.buf = inflated; // from here on every parse method reads the inflated bytes
+    this.pos = 0;
+    this._len = inflated.length;
+
+    // Read N (objNum, byteOffset) pairs
+    for (let i = 0; i < N; i++) {
+      this.skipWS();
+      this._objNums[i] = this._skipInt();
+      this.skipWS();
+      this._objOffsets[i] = this._skipInt();
+    }
+
+    // Walk each inner object
+    for (let i = 0; i < N; i++) {
+      this.pos = first + this._objOffsets[i]; // offsets are applied relative to /First
+      const d0 = this._depth;
+      this.parseObject();
+      this._depth = d0; // safety pop
+    }
+
+    this.buf = saveBuf; // restore the outer scan state (not in a finally: skipped if parseObject throws)
+    this.pos = savePos;
+    this._len = saveLen;
+  }
+
+  parseIndirectObject() {
+    this.skipWS();
+    this._skipInt(); // objNum
+    this.skipWS();
+    this._skipInt(); // gen
+    this.skipWS();
+
+    const buf = this.buf, len = this._len;
+    if (!(this.pos + 3 <= len && buf[this.pos] === o_ && buf[this.pos+1] === b_ && buf[this.pos+2] === j_)) {
+      throw new Error('expected "obj" at ' + this.pos);
+    }
+    this.pos += 3;
+    this.skipWS();
+    this.numIndirectObjects++; // counted once the "obj" keyword has been confirmed
+
+    // Parse the object body. If it's a dict, leave the frame on the
+    // stack so we can read /Length / /Type / /N / /First if a stream
+    // follows.
+    const frameDepth = this._depth; // index of the frame parseDict is about to fill
+    let wasDict = false;
+    if (this.pos + 2 <= len && buf[this.pos] === LT && buf[this.pos+1] === LT) {
+      this.parseDict();
+      wasDict = true;
+    } else {
+      this.parseObject(); // non-dict body: the stream check below is skipped for it
+    }
+    this.skipWS();
+
+    // Stream?
+    if (wasDict && this.pos + 6 <= len &&
+        buf[this.pos] === s_ && buf[this.pos+1] === t_ && buf[this.pos+2] === r_ &&
+        buf[this.pos+3] === e_ && buf[this.pos+4] === a_ && buf[this.pos+5] === m_) {
+      this.pos += 6;
+      // Optional CR/LF after 'stream'
+      if (this.pos < len && buf[this.pos] === CR) this.pos++;
+      if (this.pos < len && buf[this.pos] === LF) this.pos++;
+
+      const streamStart = this.pos; // first payload byte
+      const length    = this._stLength[frameDepth]; // -1 unless /Length was a plain integer
+      const isObjStm  = this._stIsObjStm[frameDepth];
+      const N         = this._stN[frameDepth];
+      const first     = this._stFirst[frameDepth];
+
+      let streamEnd;
+      if (length > 0) { // a zero or uncaptured /Length goes straight to the keyword scan
+        streamEnd = streamStart + length;
+        // Sanity: streamEnd should land near 'endstream'. If not, fallback.
+        if (streamEnd > len ||
+            !(buf[streamEnd] === LF || buf[streamEnd] === CR ||
+              buf[streamEnd] === e_ || IsWS[buf[streamEnd]])) {
+          streamEnd = this.findEndStream(streamStart);
+        }
+      } else {
+        streamEnd = this.findEndStream(streamStart);
+      }
+      this.pos = streamEnd;
+      this.totalStreamBytes += (streamEnd - streamStart);
+      this.numStreams++;
+
+      if (isObjStm && N > 0 && first > 0) { // needs /Type /ObjStm plus positive integer /N and /First
+        this.numObjStms++;
+        this.processObjStm(streamStart, streamEnd, N, first);
+        this.pos = streamEnd; // restore (processObjStm restores too, defensive)
+      }
+
+      this.skipWS();
+      // Optional 'endstream' keyword (we already positioned past content)
+      if (this.pos + 9 <= len &&
+          buf[this.pos] === e_ && buf[this.pos+1] === n_ && buf[this.pos+2] === d_ &&
+          buf[this.pos+3] === s_ && buf[this.pos+4] === t_ && buf[this.pos+5] === r_ &&
+          buf[this.pos+6] === e_ && buf[this.pos+7] === a_ && buf[this.pos+8] === m_) {
+        this.pos += 9;
+      }
+      this.skipWS();
+    }
+
+    // Pop the dict frame
+    if (wasDict) this._depth = frameDepth;
+
+    // 'endobj' (lenient: tolerate missing)
+    this.skipWS();
+    if (this.pos + 6 <= len &&
+        buf[this.pos] === e_ && buf[this.pos+1] === n_ && buf[this.pos+2] === d_ &&
+        buf[this.pos+3] === o_ && buf[this.pos+4] === b_ && buf[this.pos+5] === j_) {
+      this.pos += 6;
+    }
+  }
+
+  // ---- Top-level walk --------------------------------------------
+
+  walk() {
+    const buf = this.buf, len = this._len;
+
+    // Skip header line (%PDF-x.y), binary marker, etc.
+    // Strategy: scan forward until we see a digit followed by "<sp> <digit>+ <sp> obj"
+    // -- the first indirect-object header.
+    while (this.pos < len) {
+      this.skipWS(); // also skips '%' comment lines
+      if (this.pos >= len) break;
+      const b = buf[this.pos];
+      if (IsDigit[b]) {
+        // Try to validate this looks like an indirect-obj header
+        const save = this.pos; // start of the candidate object number
+        this._skipInt();
+        if (buf[this.pos] === SP || buf[this.pos] === TAB) { // a space or tab must follow the first integer
+          this.skipWS();
+          if (IsDigit[buf[this.pos]]) {
+            this._skipInt();
+            this.skipWS();
+            if (this.pos + 3 <= len && buf[this.pos] === o_ &&
+                buf[this.pos+1] === b_ && buf[this.pos+2] === j_) {
+              this.pos = save; // rewind so parseIndirectObject re-reads the full header
+              break;
+            }
+          }
+        }
+        this.pos = save + 1; // not a header: resume scanning one byte later
+      } else {
+        this.pos++;
+      }
+    }
+
+    // Walk indirect objects until xref / startxref / trailer
+    while (this.pos < len) {
+      this.skipWS();
+      if (this.pos >= len) break;
+      const b = buf[this.pos];
+      if (b === x_) break;            // xref
+      if (b === t_ && buf[this.pos+1] === r_ && buf[this.pos+2] === a_ &&
+          buf[this.pos+3] === 105 /* i */) break;  // trailer
+      if (b === s_ && buf[this.pos+1] === t_ && buf[this.pos+2] === a_ &&
+          buf[this.pos+3] === r_ && buf[this.pos+4] === t_) break;  // startxref
+      if (!IsDigit[b]) break; // any other byte that cannot start an object header also ends the walk
+      this.parseIndirectObject();
+    }
+  }
+}
+
+// ---- Main -----------------------------------------------------------
+
+function pickDefaultPdf() {
+  const dir = resolve('perf/results');
+  const entries = readdirSync(dir)
+    .filter(d => /^\d{4}-\d{2}-\d{2}T/.test(d))
+    .filter(d => statSync(join(dir, d)).isDirectory())
+    .sort();
+  for (let i = entries.length - 1; i >= 0; i--) {
+    const p = join(dir, entries[i], 'book.pdf');
+    try { statSync(p); return p; } catch (_) {}
+  }
+  throw new Error('no perf/results/*/book.pdf found; pass a path as argv[2]');
+}
+
+async function main() {
+  const args = process.argv.slice(2);
+  let inputPath = null;
+  let runs = 3;
+  let skipLoad = false;
+  for (let i = 0; i < args.length; i++) {
+    const a = args[i];
+    if (a === '--runs') runs = parseInt(args[++i], 10);
+    else if (a === '--no-load') skipLoad = true;
+    else if (!inputPath) inputPath = a;
+  }
+  if (!inputPath) inputPath = pickDefaultPdf();
+  const buf = readFileSync(inputPath);
+  console.log(`input: ${inputPath}`);
+  console.log(`size:  ${(buf.length / 1024 / 1024).toFixed(2)} MB`);
+  console.log('');
+
+  // Measure pass
+  console.log(`--- measure pass (${runs} runs) ---`);
+  const measureTimes = [];
+  let lastM = null;
+  for (let i = 0; i < runs; i++) {
+    const m = new Measurer(buf);
+    const t0 = performance.now();
+    m.walk();
+    const ms = performance.now() - t0;
+    measureTimes.push(ms);
+    console.log(`  run ${i+1}: ${ms.toFixed(1)} ms`);
+    lastM = m;
+  }
+  const minMeasure = Math.min(...measureTimes);
+  console.log(`  min:   ${minMeasure.toFixed(1)} ms`);
+  console.log('');
+  console.log('counts (last run):');
+  console.log(`  indirect objects:    ${lastM.numIndirectObjects}`);
+  console.log(`  dicts:               ${lastM.numDicts}   slots: ${lastM.numDictSlots}   max: ${lastM.maxDictSlots}`);
+  console.log(`  arrays:              ${lastM.numArrays}   slots: ${lastM.numArraySlots}   max: ${lastM.maxArraySlots}`);
+  console.log(`  refs:                ${lastM.numRefs}`);
+  console.log(`  names:               ${lastM.numNames}`);
+  console.log(`  numbers:             ${lastM.numNumbers}`);
+  console.log(`  strings (literal):   ${lastM.numStrings}`);
+  console.log(`  strings (hex):       ${lastM.numHexStrings}`);
+  console.log(`  streams:             ${lastM.numStreams}   bytes: ${(lastM.totalStreamBytes/1024/1024).toFixed(2)} MB`);
+  console.log(`  objstms:             ${lastM.numObjStms}   inner objs: ${lastM.numObjStmInnerObjects}   inflated: ${(lastM.totalInflatedBytes/1024/1024).toFixed(2)} MB`);
+  console.log(`  max recursion:       ${lastM.maxRecursionDepth}`);
+  console.log('');
+
+  if (skipLoad) return;
+
+  // pdf-lib load (1 run only -- fast-dict-onebuf is singleton-context)
+  console.log(`--- PDFDocument.load (1 run; shim is singleton-context) ---`);
+  const t0 = performance.now();
+  await PDFDocument.load(buf);
+  const loadMs = performance.now() - t0;
+  console.log(`  load: ${loadMs.toFixed(1)} ms`);
+  console.log('');
+  console.log(`ratio measure(min)/load: ${(minMeasure / loadMs).toFixed(3)}  (lower = better)`);
+}
+
+main().catch(e => { console.error(e); process.exit(1); });