diff --git a/docs/lib/paged.browser.js b/docs/lib/paged.browser.js
index 533bf298..8d212d9e 100644
--- a/docs/lib/paged.browser.js
+++ b/docs/lib/paged.browser.js
@@ -8,51 +8,30 @@
 	(global = typeof globalThis !== 'undefined' ? globalThis : global || self, global.PagedPolyfill = factory());
 })(this, (function () { 'use strict';
 
-	function getBoundingClientRect(element) {
-		if (!element) {
-			return;
-		}
-		let rect;
-		if (typeof element.getBoundingClientRect !== "undefined") {
-			rect = element.getBoundingClientRect();
-		} else {
-			let range = document.createRange();
-			range.selectNode(element);
-			rect = range.getBoundingClientRect();
-		}
-		return rect;
-	}
-
-	function getClientRects(element) {
-		if (!element) {
-			return;
-		}
-		let rect;
-		if (typeof element.getClientRects !== "undefined") {
-			rect = element.getClientRects();
-		} else {
-			let range = document.createRange();
-			range.selectNode(element);
-			rect = range.getClientRects();
-		}
-		return rect;
-	}
-
 	/**
-	 * Generates a UUID
-	 * based on: http://stackoverflow.com/questions/105034/how-to-create-a-guid-uuid-in-javascript
-	 * @returns {string} uuid
+	 * Returns a unique-within-render id: "u" followed by a base36 counter.
+	 * Replaced the prior RFC 4122 v4 UUID generator -- our pipeline only
+	 * needs uniqueness within a single render (data-ref attributes,
+	 * generated CSS variable / selector names, internal object identity),
+	 * not globally. Counter + base36 shaves the per-call cost from
+	 * ~3us (Date.now + per-char replace closure) to ~50ns and keeps IDs
+	 * short (max ~5 chars for the ~50k DOM nodes in a typical book).
+	 *
+	 * UUIDDecimal() below shares the same counter but returns the bare
+	 * decimal representation -- needed at addRefs (the data-ref writer)
+	 * so each ref is a valid integer index into `source.indexOfRefs`
+	 * (an Array) instead of a named property, saving ~2-3 MB vs the
+	 * previous string-keyed dict. The "u" prefix keeps the two id
+	 * spaces disjoint: bare base36 of 36 is "10", the same string as
+	 * UUIDDecimal()'s id for 10. Every other UUID caller goes through
+	 * UUID() because their consumers don't index an array with it.
 	 */
+	var __pagedjsCounter = 0;
 	function UUID() {
-		var d = new Date().getTime();
-		if (typeof performance !== "undefined" && typeof performance.now === "function") {
-			d += performance.now(); //use high-precision timer if available
-		}
-		return "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx".replace(/[xy]/g, function (c) {
-			var r = (d + Math.random() * 16) % 16 | 0;
-			d = Math.floor(d / 16);
-			return (c === "x" ? r : (r & 0x3 | 0x8)).toString(16);
-		});
+		return "u" + (++__pagedjsCounter).toString(36);
+	}
+	function UUIDDecimal() {
+		return (++__pagedjsCounter).toString();
 	}
 
 	function attr(element, attributes) {
@@ -169,8 +148,6 @@
 
 		this.reject = null;
 
-		this.id = UUID();
-
 		this.promise = new Promise((resolve, reject) => {
 			this.resolve = resolve;
 			this.reject = reject;
@@ -178,8 +155,6 @@
 		Object.freeze(this);
 	}
 
-	const requestIdleCallback = typeof window !== "undefined" && ("requestIdleCallback" in window ? window.requestIdleCallback : window.requestAnimationFrame);
-
 	function CSSValueToString(obj) {
 		return obj.value + (obj.unit || "");
 	}
@@ -903,28 +878,21 @@
 			return true;
 		}
 
+		// Cheap loop-detection key for the chunker's per-page Set.
+		// Common case (Element with its own data-ref): "ref|offset". Falls
+		// back to "parentRef|siblingIndex|offset" for Text/Comment nodes
+		// where the ref lives on the parent. The string is opaque to all
+		// callers other than chunker.flow's loop guard, so the format only
+		// needs to be unique-per-break-point, not human-readable.
 		toJSON(hash) {
-			let node;
-			let index = 0;
-			if (!this.node) {
-				return {};
-			}
+			if (!this.node) return "";
 			if (isElement(this.node) && this.node.dataset.ref) {
-				node = this.node.dataset.ref;
-			} else if (hash) {
-				node = this.node.parentElement.dataset.ref;
+				return this.node.dataset.ref + "|" + (this.offset || 0);
 			}
-
-			if (this.node.parentElement) {
-				const children = Array.from(this.node.parentElement.childNodes);
-				index = children.indexOf(this.node);
-			}
-
-			return JSON.stringify({
-				"node": node,
-				"index" : index,
-				"offset": this.offset
-			});
+			const parent = this.node.parentElement;
+			const parentRef = parent ? parent.dataset.ref : "";
+			const index = parent ? Array.prototype.indexOf.call(parent.childNodes, this.node) : 0;
+			return parentRef + "|" + index + "|" + (this.offset || 0);
 		}
 
 	}
@@ -1011,32 +979,6 @@
 		return true;
 	};
 
-	var isImplemented$7 = function () {
-		var assign = Object.assign, obj;
-		if (typeof assign !== "function") return false;
-		obj = { foo: "raz" };
-		assign(obj, { bar: "dwa" }, { trzy: "trzy" });
-		return obj.foo + obj.bar + obj.trzy === "razdwatrzy";
-	};
-
-	var isImplemented$6;
-	var hasRequiredIsImplemented$2;
-
-	function requireIsImplemented$2 () {
-		if (hasRequiredIsImplemented$2) return isImplemented$6;
-		hasRequiredIsImplemented$2 = 1;
-
-		isImplemented$6 = function () {
-			try {
-				Object.keys("primitive");
-				return true;
-			} catch (e) {
-				return false;
-			}
-		};
-		return isImplemented$6;
-	}
-
 	// eslint-disable-next-line no-empty-function
 	var noop$4 = function () {};
 
@@ -1044,32 +986,6 @@
 
 	var isValue$4 = function (val) { return val !== _undefined && val !== null; };
 
-	var shim$5;
-	var hasRequiredShim$5;
-
-	function requireShim$5 () {
-		if (hasRequiredShim$5) return shim$5;
-		hasRequiredShim$5 = 1;
-
-		var isValue = isValue$4;
-
-		var keys = Object.keys;
-
-		shim$5 = function (object) { return keys(isValue(object) ? Object(object) : object); };
-		return shim$5;
-	}
-
-	var keys;
-	var hasRequiredKeys;
-
-	function requireKeys () {
-		if (hasRequiredKeys) return keys;
-		hasRequiredKeys = 1;
-
-		keys = requireIsImplemented$2()() ? Object.keys : requireShim$5();
-		return keys;
-	}
-
 	var isValue$3 = isValue$4;
 
 	var validValue = function (value) {
@@ -1077,39 +993,6 @@
 		return value;
 	};
 
-	var shim$4;
-	var hasRequiredShim$4;
-
-	function requireShim$4 () {
-		if (hasRequiredShim$4) return shim$4;
-		hasRequiredShim$4 = 1;
-
-		var keys  = requireKeys()
-		  , value = validValue
-		  , max   = Math.max;
-
-		shim$4 = function (dest, src /*, …srcn*/) {
-			var error, i, length = max(arguments.length, 2), assign;
-			dest = Object(value(dest));
-			assign = function (key) {
-				try {
-					dest[key] = src[key];
-				} catch (e) {
-					if (!error) error = e;
-				}
-			};
-			for (i = 1; i < length; ++i) {
-				src = arguments[i];
-				keys(src).forEach(assign);
-			}
-			if (error !== undefined) throw error;
-			return dest;
-		};
-		return shim$4;
-	}
-
-	var assign$2 = isImplemented$7() ? Object.assign : requireShim$4();
-
 	var isValue$2 = isValue$4;
 
 	var forEach$1 = Array.prototype.forEach, create$5 = Object.create;
@@ -1129,35 +1012,9 @@
 		return result;
 	};
 
-	var str = "razdwatrzy";
-
-	var isImplemented$5 = function () {
-		if (typeof str.contains !== "function") return false;
-		return str.contains("dwa") === true && str.contains("foo") === false;
-	};
-
-	var shim$3;
-	var hasRequiredShim$3;
-
-	function requireShim$3 () {
-		if (hasRequiredShim$3) return shim$3;
-		hasRequiredShim$3 = 1;
-
-		var indexOf = String.prototype.indexOf;
-
-		shim$3 = function (searchString /*, position*/) {
-			return indexOf.call(this, searchString, arguments[1]) > -1;
-		};
-		return shim$3;
-	}
-
-	var contains$1 = isImplemented$5() ? String.prototype.contains : requireShim$3();
-
 	var isValue$1         = is$4
 	  , isPlainFunction = is
-	  , assign$1          = assign$2
-	  , normalizeOpts   = normalizeOptions
-	  , contains        = contains$1;
+	  , normalizeOpts   = normalizeOptions;
 
 	var d$1 = (d$2.exports = function (dscr, value/*, options*/) {
 		var c, e, w, options, desc;
@@ -1169,16 +1026,16 @@
 			options = arguments[2];
 		}
 		if (isValue$1(dscr)) {
-			c = contains.call(dscr, "c");
-			e = contains.call(dscr, "e");
-			w = contains.call(dscr, "w");
+			c = dscr.includes("c");
+			e = dscr.includes("e");
+			w = dscr.includes("w");
 		} else {
 			c = w = true;
 			e = false;
 		}
 
 		desc = { value: value, configurable: c, enumerable: e, writable: w };
-		return !options ? desc : assign$1(normalizeOpts(options), desc);
+		return !options ? desc : Object.assign(normalizeOpts(options), desc);
 	});
 
 	d$1.gs = function (dscr, get, set/*, options*/) {
@@ -1203,15 +1060,15 @@
 			set = undefined;
 		}
 		if (isValue$1(dscr)) {
-			c = contains.call(dscr, "c");
-			e = contains.call(dscr, "e");
+			c = dscr.includes("c");
+			e = dscr.includes("e");
 		} else {
 			c = true;
 			e = false;
 		}
 
 		desc = { get: get, set: set, configurable: c, enumerable: e };
-		return !options ? desc : assign$1(normalizeOpts(options), desc);
+		return !options ? desc : Object.assign(normalizeOpts(options), desc);
 	};
 
 	var dExports = d$2.exports;
@@ -1366,6 +1223,25 @@
 	 * @param {any} context scope of this
 	 * @example this.content = new Hook(this);
 	 */
+	// [PATCH: sync-chain] Used by the chunker hot path to confirm that
+	// Hook.trigger() returned the sync sentinel (undefined). If a handler
+	// returned a thenable, the chunker dropping it here would silently
+	// lose async work -- so we throw instead. Limitation of this fork:
+	// the per-page hooks (beforePageLayout / afterPageLayout /
+	// finalizePage / handleBreaks / *layout / page.layout etc.) must
+	// have all-synchronous handlers. Bundle ships with no async handlers
+	// for these on our pipeline; document and assert.
+	function _assertSync(triggerResult, hookName) {
+		if (triggerResult && typeof triggerResult.then === "function") {
+			throw new Error(
+				"paged.js (forked): async handler registered for hook '" + hookName + "'. " +
+				"This bundle's per-page hot path is synchronous; async handlers " +
+				"must be registered for the once-per-render hooks (beforeParsed, " +
+				"afterParsed, afterRendered) instead, or the chain re-asyncified."
+			);
+		}
+	}
+
 	class Hook {
 		constructor(context){
 			this.context = context || this;
@@ -1391,39 +1267,61 @@
 		}
 
 		/**
-		 * Triggers a hook to run all functions
-		 * @example this.content.trigger(args).then(function(){...});
-		 * @return {Promise} results
+		 * Triggers a hook to run all functions.
+		 * @return {Promise|undefined} A Promise that resolves when all
+		 *   thenable-returning handlers settle, OR `undefined` if no
+		 *   handler returned a thenable (the all-synchronous fast path).
+		 *
+		 * [PATCH: hook-fast-path] Upstream always wrapped sync handler
+		 * results in `new Promise(resolve => resolve(...))` and returned
+		 * `Promise.all(promises)`, so callers' `await trigger(...)` was a
+		 * mandatory microtask boundary even when every handler resolved
+		 * synchronously. We return `undefined` on the all-sync path so
+		 * callers can write:
+		 *
+		 *   let p = hook.trigger(...);
+		 *   if (p) await p;
+		 *
+		 * and skip the microtask boundary entirely. Per-page hot-loop sites
+		 * in the chunker do this; one-shot callers can keep `await
+		 * trigger(...)` (`await undefined` works, costing one microtask
+		 * tick) but must not chain `.then()` on a possibly-undefined result.
 		 */
 		trigger(){
 			var args = arguments;
 			var context = this.context;
-			var promises = [];
-
-			this.hooks.forEach(function(task) {
-				var executing = task.apply(context, args);
+			var promises;
 
-				if(executing && typeof executing["then"] === "function") {
-					// Task is a function that returns a promise
-					promises.push(executing);
-				} else {
-					// Otherwise Task resolves immediately, add resolved promise with result
-					promises.push(new Promise((resolve, reject) => {
-						resolve(executing);
-					}));
+			for (var i = 0; i < this.hooks.length; i++) {
+				var executing = this.hooks[i].apply(context, args);
+				if (executing && typeof executing["then"] === "function") {
+					(promises = promises || []).push(executing);
 				}
-			});
-
+			}
 
-			return Promise.all(promises);
+			return promises ? Promise.all(promises) : undefined;
 		}
 
 		/**
-	   * Triggers a hook to run all functions synchronously
-	   * @example this.content.trigger(args).then(function(){...});
-	   * @return {Array} results
+	   * Triggers a hook to run all functions synchronously.
+	   * @return {Array|undefined} results array, or undefined when no
+	   *   handlers are registered (callers can skip their reducer
+	   *   forEach with a simple truthy check).
+	   *
+	   * [PATCH: hook-fast-path-sync] Mirrors the async `trigger()` patch:
+	   * skip the results-array alloc and the empty-forEach indirection
+	   * when this.hooks is empty. In the per-page hot path, onOverflow
+	   * and onBreakToken have zero registered handlers in this build,
+	   * so every call to those two hooks via triggerSync was pure
+	   * dispatch overhead -- ~3300 calls per render on the 1650-page
+	   * book. Callers in those reducer sites now read:
+	   *
+	   *   let r = hook.triggerSync(...);
+	   *   if (r) r.forEach(...);
 	   */
 		triggerSync(){
+			if (this.hooks.length === 0) return undefined;
+
 			var args = arguments;
 			var context = this.context;
 			var results = [];
@@ -1488,10 +1386,35 @@
 			this.settings = options || {};
 
 			this.maxChars = this.settings.maxChars || MAX_CHARS_PER_BREAK;
+
+			// [PATCH: parent-lookup-cache] One-entry memo of the last
+			// (sourceParent, dest) -> destParent resolution from append().
+			// Consecutive siblings in the source tree all resolve to the
+			// same destParent, so caching the previous result lets the
+			// per-call findElement / indexOfRefs lookup short-circuit.
+			// Invalidated at the start of every renderTo; safe within a
+			// single renderTo loop because append() never detaches DOM
+			// from dest (removeOverflow only fires after the loop exits).
+			this._lastSrcParent = null;
+			this._lastDest = null;
+			this._lastDestParent = null;
 			this.forceRenderBreak = false;
 		}
 
-		async renderTo(wrapper, source, breakToken, bounds = this.bounds) {
+		// [PATCH: sync-chain] renderTo no longer needs to be async because
+		// waitForImages is now sync (see its comment). Removing `async`
+		// removes the per-page Promise allocation that was returned from
+		// page.layout / chunker.layout up the chain.
+		renderTo(wrapper, source, breakToken, bounds = this.bounds) {
+			// [PATCH: parent-lookup-cache] Invalidate the per-Layout
+			// parent memo. The previous renderTo on this Layout instance
+			// (same Page, multiple renderTo calls) may have run
+			// findBreakToken -> removeOverflow at exit, leaving the
+			// cached destParent detached.
+			this._lastSrcParent = null;
+			this._lastDest = null;
+			this._lastDestParent = null;
+
 			let start = this.getStart(source, breakToken);
 			let walker = walk$2(start, source);
 
@@ -1518,7 +1441,7 @@
 
 			let prevBreakToken = breakToken || new BreakToken(start);
 
-			this.hooks && this.hooks.onPageLayout.trigger(wrapper, prevBreakToken, this);
+			if (this.hooks) _assertSync(this.hooks.onPageLayout.trigger(wrapper, prevBreakToken, this), "onPageLayout");
 
 			while (!done && !newBreakToken) {
 				next = walker.next();
@@ -1527,36 +1450,36 @@
 				done = next.done;
 
 				if (!node) {
-					this.hooks && this.hooks.layout.trigger(wrapper, this);
+					if (this.hooks) _assertSync(this.hooks.layout.trigger(wrapper, this), "layout");
 
 					let imgs = wrapper.querySelectorAll("img");
 					if (imgs.length) {
-						await this.waitForImages(imgs);
+						this.waitForImages(imgs);
 					}
 
 					newBreakToken = this.findBreakToken(wrapper, source, bounds, prevBreakToken);
 
 					if (newBreakToken && newBreakToken.equals(prevBreakToken)) {
 						console.warn("Unable to layout item: ", prevNode);
-						this.hooks && this.hooks.beforeRenderResult.trigger(undefined, wrapper, this);
+						if (this.hooks) _assertSync(this.hooks.beforeRenderResult.trigger(undefined, wrapper, this), "beforeRenderResult");
 						return new RenderResult(undefined, new OverflowContentError("Unable to layout item", [prevNode]));
 					}
 
 					this.rebuildTableFromBreakToken(newBreakToken, wrapper);
 
-					this.hooks && this.hooks.beforeRenderResult.trigger(newBreakToken, wrapper, this);
+					if (this.hooks) _assertSync(this.hooks.beforeRenderResult.trigger(newBreakToken, wrapper, this), "beforeRenderResult");
 					return new RenderResult(newBreakToken);
 				}
 
-				this.hooks && this.hooks.layoutNode.trigger(node);
+				if (this.hooks) _assertSync(this.hooks.layoutNode.trigger(node), "layoutNode");
 
 				// Check if the rendered element has a break set
 				if (hasRenderedContent && this.shouldBreak(node, start)) {
-					this.hooks && this.hooks.layout.trigger(wrapper, this);
+					if (this.hooks) _assertSync(this.hooks.layout.trigger(wrapper, this), "layout");
 
 					let imgs = wrapper.querySelectorAll("img");
 					if (imgs.length) {
-						await this.waitForImages(imgs);
+						this.waitForImages(imgs);
 					}
 
 					newBreakToken = this.findBreakToken(wrapper, source, bounds, prevBreakToken);
@@ -1611,7 +1534,7 @@
 				}
 
 				if (this.forceRenderBreak) {
-					this.hooks && this.hooks.layout.trigger(wrapper, this);
+					if (this.hooks) _assertSync(this.hooks.layout.trigger(wrapper, this), "layout");
 
 					newBreakToken = this.findBreakToken(wrapper, source, bounds, prevBreakToken);
 
@@ -1630,11 +1553,11 @@
 				// Only check overflow once per maxChars of new content.
 				if (length - lengthAtLastCheck >= this.maxChars) {
 
-					this.hooks && this.hooks.layout.trigger(wrapper, this);
+					if (this.hooks) _assertSync(this.hooks.layout.trigger(wrapper, this), "layout");
 
 					let imgs = wrapper.querySelectorAll("img");
 					if (imgs.length) {
-						await this.waitForImages(imgs);
+						this.waitForImages(imgs);
 					}
 
 					newBreakToken = this.findBreakToken(wrapper, source, bounds, prevBreakToken);
@@ -1653,7 +1576,7 @@
 						if (after) {
 							newBreakToken = new BreakToken(after);
 						} else {
-							this.hooks && this.hooks.beforeRenderResult.trigger(undefined, wrapper, this);
+							if (this.hooks) _assertSync(this.hooks.beforeRenderResult.trigger(undefined, wrapper, this), "beforeRenderResult");
 							return new RenderResult(undefined, new OverflowContentError("Unable to layout item", [node]));
 						}
 					}
@@ -1661,7 +1584,7 @@
 
 			}
 
-			this.hooks && this.hooks.beforeRenderResult.trigger(newBreakToken, wrapper, this);
+			if (this.hooks) _assertSync(this.hooks.beforeRenderResult.trigger(newBreakToken, wrapper, this), "beforeRenderResult");
 			return new RenderResult(newBreakToken);
 		}
 
@@ -1671,7 +1594,7 @@
 				offset
 			);
 			let breakHooks = this.hooks.onBreakToken.triggerSync(newBreakToken, undefined, node, this);
-			breakHooks.forEach((newToken) => {
+			if (breakHooks) breakHooks.forEach((newToken) => {
 				if (typeof newToken != "undefined") {
 					newBreakToken = newToken;
 				}
@@ -1715,7 +1638,16 @@
 			let clone = cloneNode(node, !shallow);
 
 			if (node.parentNode && isElement(node.parentNode)) {
-				let parent = findElement(node.parentNode, dest);
+				const srcParent = node.parentNode;
+				// [PATCH: parent-lookup-cache] Consecutive sibling appends
+				// share the same source parent; reuse the prior result
+				// instead of walking dest.indexOfRefs again.
+				let parent;
+				if (srcParent === this._lastSrcParent && dest === this._lastDest) {
+					parent = this._lastDestParent;
+				} else {
+					parent = findElement(srcParent, dest);
+				}
 				// Rebuild chain
 				if (parent) {
 					parent.appendChild(clone);
@@ -1747,20 +1679,38 @@
 					dest.appendChild(clone);
 				}
 
+				// [PATCH: parent-lookup-cache] Cache the resolved (or
+				// rebuilt-and-attached) parent so the next sibling can
+				// skip the lookup. Skip on the no-rebuild fall-through
+				// where parent stayed null -- a later call with the same
+				// srcParent should still attempt the lookup.
+				if (parent) {
+					this._lastSrcParent = srcParent;
+					this._lastDest = dest;
+					this._lastDestParent = parent;
+				}
 
 			} else {
 				dest.appendChild(clone);
 			}
 
-			if (clone.dataset && clone.dataset.ref) {
+			// [PATCH: append-ref-local] Cache clone.dataset.ref in a
+			// local. Each .ref access goes through getAttribute and
+			// allocates a fresh JS string; the existence check + dict
+			// write were two reads of the same value. Saves one string
+			// allocation per ~50k append calls on the book (~1.5 MB
+			// heap per paired heap-sampling A/B at 512 B sampling).
+			// Same shape as PATCH: addRefs-uuid-local above.
+			const ref = clone.dataset && clone.dataset.ref;
+			if (ref) {
 				if (!dest.indexOfRefs) {
 					dest.indexOfRefs = {};
 				}
-				dest.indexOfRefs[clone.dataset.ref] = clone;
+				dest.indexOfRefs[ref] = clone;
 			}
 
 			let nodeHooks = this.hooks.renderNode.triggerSync(clone, node, this);
-			nodeHooks.forEach((newNode) => {
+			if (nodeHooks) nodeHooks.forEach((newNode) => {
 				if (typeof newNode != "undefined") {
 					clone = newNode;
 				}
@@ -1786,29 +1736,27 @@
 			}
 		}
 
-		async waitForImages(imgs) {
-			let results = Array.from(imgs).map(async (img) => {
-				return this.awaitImageLoaded(img);
-			});
-			await Promise.all(results);
-		}
-
-		async awaitImageLoaded(image) {
-			return new Promise(resolve => {
-				if (image.complete !== true) {
-					image.onload = function () {
-						let {width, height} = window.getComputedStyle(image);
-						resolve(width, height);
-					};
-					image.onerror = function (e) {
-						let {width, height} = window.getComputedStyle(image);
-						resolve(width, height, e);
-					};
-				} else {
-					let {width, height} = window.getComputedStyle(image);
-					resolve(width, height);
+		// [PATCH: sync-chain] waitForImages used to wrap every image in
+		// `new Promise(resolve => ...)` and await `Promise.all(...)`, so
+		// `renderTo` was forced to be async even when every image was
+		// already loaded (which is our case -- page.goto(url, {
+		// waitUntil: "load" }) settles before paged.js starts rendering).
+		//
+		// In our headless pipeline image.complete is always true at this
+		// point. If a future caller hits this with a not-yet-loaded
+		// image, that's a pipeline bug and we throw immediately rather
+		// than silently making the rest of the layout chain async again.
+		waitForImages(imgs) {
+			for (const img of imgs) {
+				if (img.complete !== true) {
+					throw new Error(
+						"paged.js (forked): image not loaded at render time. " +
+						"This branch dropped async image-loading support; the " +
+						"render pipeline must finish loading all images before " +
+						"calling paged.js. Image: " + (img.src || img.outerHTML)
+					);
 				}
-			});
+			}
 		}
 
 		avoidBreakInside(node, limiter) {
@@ -1927,7 +1875,7 @@
 			let breakToken, breakLetter;
 
 			let overflowHooks = this.hooks.onOverflow.triggerSync(overflow, rendered, bounds, this);
-			overflowHooks.forEach((newOverflow) => {
+			if (overflowHooks) overflowHooks.forEach((newOverflow) => {
 				if (typeof newOverflow != "undefined") {
 					overflow = newOverflow;
 				}
@@ -1937,7 +1885,7 @@
 				breakToken = this.createBreakToken(overflow, rendered, source);
 				// breakToken is nullable
 				let breakHooks = this.hooks.onBreakToken.triggerSync(breakToken, overflow, rendered, this);
-				breakHooks.forEach((newToken) => {
+				if (breakHooks) breakHooks.forEach((newToken) => {
 					if (typeof newToken != "undefined") {
 						breakToken = newToken;
 					}
@@ -1956,7 +1904,12 @@
 
 				if (breakToken && breakToken.node && extract) {
 					let removed = this.removeOverflow(overflow, breakLetter);
-					this.hooks && this.hooks.afterOverflowRemoved.trigger(removed, rendered, this);
+					// [PATCH: assert-sync] Guard against silent async-handler
+					// drop. Upstream fired the trigger without `await`, so any
+					// async handler's work would have been lost. _assertSync
+					// throws instead if a handler returns a thenable -- the
+					// fork's per-page hot path is synchronous, see Hook.trigger.
+					if (this.hooks) _assertSync(this.hooks.afterOverflowRemoved.trigger(removed, rendered, this), "afterOverflowRemoved");
 				}
 
 			}
@@ -1995,7 +1948,14 @@
 				br = undefined;
 
 				if (node) {
-					let pos = getBoundingClientRect(node);
+					let pos;
+					if (node.nodeType === 1) {
+						pos = node.getBoundingClientRect();
+					} else {
+						let range = document.createRange();
+						range.selectNode(node);
+						pos = range.getBoundingClientRect();
+					}
 					let left = Math.round(pos.left);
 					let right = Math.floor(pos.right);
 					let top = Math.round(pos.top);
@@ -2092,7 +2052,9 @@
 						node.textContent.trim().length &&
 						!breakInsideAvoidParentNode(node.parentNode)) {
 
-						let rects = getClientRects(node);
+						let textRange = document.createRange();
+						textRange.selectNode(node);
+						let rects = textRange.getClientRects();
 						let rect;
 						left = 0;
 						top = 0;
@@ -2199,7 +2161,7 @@
 					break;
 				}
 
-				pos = getBoundingClientRect(word);
+				pos = word.getBoundingClientRect();
 
 				left = Math.floor(pos.left);
 				right = Math.floor(pos.right);
@@ -2224,7 +2186,7 @@
 							break;
 						}
 
-						pos = getBoundingClientRect(letter);
+						pos = letter.getBoundingClientRect();
 						left = Math.floor(pos.left);
 						top = Math.floor(pos.top);
 
@@ -2244,7 +2206,25 @@
 
 		removeOverflow(overflow, breakLetter) {
 			let {startContainer} = overflow;
-			let extracted = overflow.extractContents();
+
+			// [PATCH: extract-vs-delete] Range.extractContents() moves the
+			// removed nodes into a DocumentFragment; Range.deleteContents()
+			// just removes them. The only consumer of the returned
+			// fragment is Footnotes.afterOverflowRemoved, which iterates
+			// the rendered area's footnotes and for each looks up its
+			// [data-footnote-call=...] in the removed fragment. So the
+			// extracted nodes only matter if the page content area
+			// (`this.element`, `.pagedjs_page_content`) held a footnote
+			// call. When it does not, delete in place and hand back an
+			// EMPTY fragment rather than null, so a handler that queries
+			// the fragment unconditionally still gets a valid node.
+			let extracted;
+			if (this.element && this.element.querySelector("[data-footnote-call]")) {
+				extracted = overflow.extractContents();
+			} else {
+				overflow.deleteContents();
+				extracted = document.createDocumentFragment();
+			}
 
 			this.hyphenateAtBreak(startContainer, breakLetter);
 
@@ -2403,20 +2383,33 @@
 		}
 		*/
 
-		async layout(contents, breakToken, maxChars) {
+		// [PATCH: sync-chain] page.layout / append no longer await
+		// renderTo (which is now sync). Removing `async` removes the
+		// Promise allocation around each return.
+		layout(contents, breakToken, maxChars) {
 
 			this.clear();
 
 			this.startToken = breakToken;
 
 			let settings = this.settings;
-			if (!settings.maxChars && maxChars) {
+			// [PATCH: maxChars-propagate] Upstream gated this on
+			// `!settings.maxChars`, which froze the chunker's running
+			// estimate at whatever value the first non-empty page produced.
+			// On a book whose first real page happens to be short, the
+			// estimate locked in tiny (e.g. 177 chars) and every later page
+			// fell back to checking overflow every 177 chars of new
+			// content -- ~5 hasOverflow / gBCR layout flushes per page on
+			// average, where 1-2 would suffice. Always propagate so each
+			// page picks up the most recent estimate; the chunker's
+			// recordCharLength still drives that value.
+			if (maxChars) {
 				settings.maxChars = maxChars;
 			}
 
 			this.layoutMethod = new Layout(this.area, this.hooks, settings);
 
-			let renderResult = await this.layoutMethod.renderTo(this.wrapper, contents, breakToken);
+			let renderResult = this.layoutMethod.renderTo(this.wrapper, contents, breakToken);
 			let newBreakToken = renderResult.breakToken;
 
 			this.addListeners(contents);
@@ -2426,13 +2419,13 @@
 			return newBreakToken;
 		}
 
-		async append(contents, breakToken) {
+		append(contents, breakToken) {
 
 			if (!this.layoutMethod) {
 				return this.layout(contents, breakToken);
 			}
 
-			let renderResult = await this.layoutMethod.renderTo(this.wrapper, contents, breakToken);
+			let renderResult = this.layoutMethod.renderTo(this.wrapper, contents, breakToken);
 			let newBreakToken = renderResult.breakToken;
 
 			this.endToken = newBreakToken;
@@ -2465,14 +2458,7 @@
 		}
 
 		addListeners(contents) {
-			if (typeof ResizeObserver !== "undefined") {
-				this.addResizeObserver(contents);
-			} else {
-				this._checkOverflowAfterResize = this.checkOverflowAfterResize.bind(this, contents);
-				this.element.addEventListener("overflow", this._checkOverflowAfterResize, false);
-				this.element.addEventListener("underflow", this._checkOverflowAfterResize, false);
-			}
-			// TODO: fall back to mutation observer?
+			this.addResizeObserver(contents);
 
 			this._onScroll = function () {
 				if (this.listening) {
@@ -2491,11 +2477,8 @@
 		removeListeners() {
 			this.listening = false;
 
-			if (typeof ResizeObserver !== "undefined" && this.ro) {
+			if (this.ro) {
 				this.ro.disconnect();
-			} else if (this.element) {
-				this.element.removeEventListener("overflow", this._checkOverflowAfterResize, false);
-				this.element.removeEventListener("underflow", this._checkOverflowAfterResize, false);
 			}
 
 			this.element && this.element.removeEventListener("scroll", this._onScroll);
@@ -2620,13 +2603,40 @@
 			// which scans the entire source DOM (thousands of nodes). Measured
 			// as 848 + 42 noDict calls in createBreakToken ≈ 1+ s of render on
 			// the 1651-page book.
-			if (!content.indexOfRefs) content.indexOfRefs = {};
+			//
+			// [PATCH: source-indexOfRefs-array] Use an Array (integer keys
+			// from the decimal UUID counter -- see UUID()) instead of a dict.
+			// V8 keeps it in a flat elements store (HOLEY rather than PACKED,
+			// because it is pre-sized below): ~8 B per slot vs ~40-50 B per
+			// dict entry. dest/fragment.indexOfRefs (sparse) stay dicts at
+			// their own init sites. `findRef` does `arr[ref]` either way -- a
+			// decimal-string ref is an array index, so callers need no branch.
+			//
+			// [PATCH: source-indexOfRefs-presize] Size the array up front
+			// from the live HTMLCollection's .length. V8 grows arrays
+			// geometrically -- writing slots 1..N via doubling does
+			// log2(N) backing-store reallocations, each allocating the
+			// new store and orphaning the old (transient bytes ~= 2x the
+			// final size). Pre-sizing skips all of that.
+			if (!content.indexOfRefs) {
+				const elementCount = content.getElementsByTagName ? content.getElementsByTagName("*").length : 0;
+				content.indexOfRefs = new Array(elementCount + 1);
+			}
 
 			let node = treeWalker.nextNode();
 			while(node) {
 
-				if (!node.hasAttribute("data-ref")) {
-					let uuid = UUID();
+				// [PATCH: addRefs-uuid-local] Read data-ref once via
+				// getAttribute (null-tested as the existence check),
+				// reuse the local string for indexOfRefs. Previously
+				// hasAttribute + setAttribute + getAttribute on the
+				// new-uuid branch caused one extra DOM read and one
+				// duplicate string allocation per ~50k source nodes
+				// (~460 KB heap on the book per paired heap-sampling
+				// A/B at 4 KB sampling).
+				let uuid = node.getAttribute("data-ref");
+				if (!uuid) {
+					uuid = UUIDDecimal();
 					node.setAttribute("data-ref", uuid);
 				}
 
@@ -2638,8 +2648,7 @@
 
 				// node.setAttribute("data-text", node.textContent.trim().length);
 
-				// [PATCH: findRef fast-path] record after data-ref is guaranteed.
-				content.indexOfRefs[node.getAttribute("data-ref")] = node;
+				content.indexOfRefs[uuid] = node;
 
 				node = treeWalker.nextNode();
 			}
@@ -2664,7 +2673,13 @@
 		constructor(context){
 			this._q = [];
 			this.context = context;
-			this.tick = requestAnimationFrame;
+			// [PATCH: queue-tick] Upstream uses requestAnimationFrame as the
+			// per-task tick, which on a headless puppeteer render still waits
+			// per frame even with no compositor. Across 1651 pages that's
+			// ~700 ms of V8 (idle). queueMicrotask schedules on the microtask
+			// queue and fires before the next event-loop iteration, dropping
+			// the per-page wait to microsecond-scale.
+			this.tick = (cb) => queueMicrotask(cb);
 			this.running = false;
 			this.paused = false;
 		}
@@ -3064,14 +3079,20 @@
 		// 	}
 		// }
 
+		// [PATCH: sync-chain] *layout is a sync generator now, so
+		// renderer.next() returns synchronously -- no per-page await.
+		// render() itself stays `async` so it still returns a Promise
+		// to callers (flow()) that await it, alongside flow()'s other
+		// once-per-render awaits (loadFonts, beforeParsed / afterParsed / afterRendered).
 		async render(parsed, startAt) {
 			let renderer = this.layout(parsed, startAt);
 
-			let done = false;
 			let result;
-			while (!done) {
-				result = await this.q.enqueue(() => { return this.renderAsync(renderer); });
-				done = result.done;
+			while (true) {
+				if (this.stopped) return { done: true, canceled: true };
+				result = renderer.next();
+				if (this.stopped) return { done: true, canceled: true };
+				if (result.done) break;
 			}
 
 			return result;
@@ -3087,35 +3108,18 @@
 			// this.q.clear();
 		}
 
-		renderOnIdle(renderer) {
-			return new Promise(resolve => {
-				requestIdleCallback(async () => {
-					if (this.stopped) {
-						return resolve({ done: true, canceled: true });
-					}
-					let result = await renderer.next();
-					if (this.stopped) {
-						resolve({ done: true, canceled: true });
-					} else {
-						resolve(result);
-					}
-				});
-			});
-		}
-
-		async renderAsync(renderer) {
-			if (this.stopped) {
-				return { done: true, canceled: true };
-			}
-			let result = await renderer.next();
-			if (this.stopped) {
-				return { done: true, canceled: true };
-			} else {
-				return result;
-			}
-		}
+		// [PATCH: sync-chain] renderOnIdle and renderAsync removed --
+		// both wrapped renderer.next() (now sync) in async machinery,
+		// and the only caller (render() via this.q.enqueue) was already
+		// removed in the drop-queue change.
 
-		async handleBreaks(node, force) {
+		// [PATCH: sync-chain] handleBreaks no longer awaits hook triggers
+		// (Hook.trigger returns undefined on the all-sync path, which is
+		// our only path). If a future caller registers an async handler
+		// for any of these hooks, Hook.trigger will return a Promise and
+		// dropping it here would silently lose the work -- so each trigger
+		// result goes through `_assertSync` (helper lives below) instead.
+		handleBreaks(node, force) {
 			let currentPage = this.total + 1;
 			let currentPosition = currentPage % 2 === 0 ? "left" : "right";
 			// TODO: Recto and Verso should reverse for rtl languages
@@ -3161,52 +3165,62 @@
 			}
 
 			if (page) {
-				await this.hooks.beforePageLayout.trigger(page, undefined, undefined, this);
+				_assertSync(this.hooks.beforePageLayout.trigger(page, undefined, undefined, this), "beforePageLayout");
 				this.emit("page", page);
-				// await this.hooks.layout.trigger(page.element, page, undefined, this);
-				await this.hooks.afterPageLayout.trigger(page.element, page, undefined, this);
-				await this.hooks.finalizePage.trigger(page.element, page, undefined, this);
+				_assertSync(this.hooks.afterPageLayout.trigger(page.element, page, undefined, this), "afterPageLayout");
+				_assertSync(this.hooks.finalizePage.trigger(page.element, page, undefined, this), "finalizePage");
 				this.emit("renderedPage", page);
 			}
 		}
 
-		async *layout(content, startAt) {
+		// [PATCH: sync-chain] *layout is now a sync generator, not an
+		// async generator. With handleBreaks, page.layout, renderTo, and
+		// every per-page hook trigger all synchronous in our pipeline,
+		// nothing inside this generator needs to await. The sync form
+		// avoids ~1651 Promise allocations per render (one per
+		// `renderer.next()` call) and the matching microtask boundaries.
+		*layout(content, startAt) {
 			let breakToken = startAt || false;
-			let tokens = [];
+			// [PATCH: tokens-set] Loop-detection used `tokens.lastIndexOf(...)`
+			// on an array, which scans up to N entries per page -- O(n^2)
+			// across a render. A Set gives O(1) lookup. The absolute saving
+			// on our 1651-page book is small (~80 us per late page) but the
+			// algorithmic shape is the load-bearing change.
+			let tokens = new Set();
 
 			while (breakToken !== undefined && (true)) {
 
 				if (breakToken && breakToken.node) {
-					await this.handleBreaks(breakToken.node);
+					this.handleBreaks(breakToken.node);
 				} else {
-					await this.handleBreaks(content.firstChild);
+					this.handleBreaks(content.firstChild);
 				}
 
 				let page = this.addPage();
 
-				await this.hooks.beforePageLayout.trigger(page, content, breakToken, this);
+				_assertSync(this.hooks.beforePageLayout.trigger(page, content, breakToken, this), "beforePageLayout");
 				this.emit("page", page);
 
 				// Layout content in the page, starting from the breakToken
-				breakToken = await page.layout(content, breakToken, this.maxChars);
+				breakToken = page.layout(content, breakToken, this.maxChars);
 
 				if (breakToken) {
 					let newToken = breakToken.toJSON(true);
-					if (tokens.lastIndexOf(newToken) > -1) {
+					if (tokens.has(newToken)) {
 						// loop
 						let err = new OverflowContentError("Layout repeated", [breakToken.node]);
 						console.error("Layout repeated at: ", breakToken.node);
 						return err;
 					} else {
-						tokens.push(newToken);
+						tokens.add(newToken);
 					}
 				}
 
-				await this.hooks.afterPageLayout.trigger(page.element, page, breakToken, this);
-				await this.hooks.finalizePage.trigger(page.element, page, undefined, this);
+				_assertSync(this.hooks.afterPageLayout.trigger(page.element, page, breakToken, this), "afterPageLayout");
+				_assertSync(this.hooks.finalizePage.trigger(page.element, page, undefined, this), "finalizePage");
 				this.emit("renderedPage", page);
 
-				this.recoredCharLength(page.wrapper.textContent.length);
+				this.recordCharLength(page.wrapper.textContent.length);
 
 				yield breakToken;
 
@@ -3216,19 +3230,33 @@
 
 		}
 
-		recoredCharLength(length) {
+		recordCharLength(length) {
 			if (length === 0) {
 				return;
 			}
 
 			this.charsPerBreak.push(length);
 
-			// Keep the length of the last few breaks
-			if (this.charsPerBreak.length > 4) {
+			// [PATCH: maxChars-running-max] Upstream tracked the running
+			// average over the last 4 page text-content lengths and used
+			// it as `maxChars`, the renderTo overflow-check period.
+			// Average is the wrong statistic: short pages (chapter ends,
+			// part dividers) get recorded alongside full pages, dragging
+			// the estimate well below true page capacity. The check then
+			// fires several times per full page when one call would have
+			// sufficed -- each call is a hasOverflow / gBCR layout flush.
+			// The running max over a wider window biases toward true
+			// capacity (the largest page recently seen), so overflow
+			// pages typically resolve in a single check.
+			if (this.charsPerBreak.length > 16) {
 				this.charsPerBreak.shift();
 			}
 
-			this.maxChars = this.charsPerBreak.reduce((a, b) => a + b, 0) / (this.charsPerBreak.length);
+			let m = 0;
+			for (let i = 0; i < this.charsPerBreak.length; i++) {
+				if (this.charsPerBreak[i] > m) m = this.charsPerBreak[i];
+			}
+			this.maxChars = m;
 		}
 
 		removePages(fromIndex=0) {
@@ -5199,11 +5227,10 @@
 	};
 
 	var MIN_SIZE = 16 * 1024;
-	var SafeUint32Array = typeof Uint32Array !== 'undefined' ? Uint32Array : Array; // fallback on Array when TypedArray is not supported
 
 	var adoptBuffer$2 = function adoptBuffer(buffer, size) {
 	    if (buffer === null || buffer.length < size) {
-	        return new SafeUint32Array(Math.max(size + 1024, MIN_SIZE));
+	        return new Uint32Array(Math.max(size + 1024, MIN_SIZE));
 	    }
 
 	    return buffer;
@@ -10599,10 +10626,6 @@
 	 * http://opensource.org/licenses/BSD-3-Clause
 	 */
 
-	var util$2 = util$3;
-	var has$1 = Object.prototype.hasOwnProperty;
-	var hasNativeMap = typeof Map !== "undefined";
-
 	/**
 	 * A data structure which is a combination of an array and a set. Adding a new
 	 * member is O(1), testing for membership is O(1), and finding the index of an
@@ -10611,7 +10634,7 @@
 	 */
 	function ArraySet$1() {
 	  this._array = [];
-	  this._set = hasNativeMap ? new Map() : Object.create(null);
+	  this._set = new Map();
 	}
 
 	/**
@@ -10632,7 +10655,7 @@
 	 * @returns Number
 	 */
 	ArraySet$1.prototype.size = function ArraySet_size() {
-	  return hasNativeMap ? this._set.size : Object.getOwnPropertyNames(this._set).length;
+	  return this._set.size;
 	};
 
 	/**
@@ -10641,18 +10664,13 @@
 	 * @param String aStr
 	 */
 	ArraySet$1.prototype.add = function ArraySet_add(aStr, aAllowDuplicates) {
-	  var sStr = hasNativeMap ? aStr : util$2.toSetString(aStr);
-	  var isDuplicate = hasNativeMap ? this.has(aStr) : has$1.call(this._set, sStr);
+	  var isDuplicate = this.has(aStr);
 	  var idx = this._array.length;
 	  if (!isDuplicate || aAllowDuplicates) {
 	    this._array.push(aStr);
 	  }
 	  if (!isDuplicate) {
-	    if (hasNativeMap) {
-	      this._set.set(aStr, idx);
-	    } else {
-	      this._set[sStr] = idx;
-	    }
+	    this._set.set(aStr, idx);
 	  }
 	};
 
@@ -10662,12 +10680,7 @@
 	 * @param String aStr
 	 */
 	ArraySet$1.prototype.has = function ArraySet_has(aStr) {
-	  if (hasNativeMap) {
-	    return this._set.has(aStr);
-	  } else {
-	    var sStr = util$2.toSetString(aStr);
-	    return has$1.call(this._set, sStr);
-	  }
+	  return this._set.has(aStr);
 	};
 
 	/**
@@ -10676,18 +10689,10 @@
 	 * @param String aStr
 	 */
 	ArraySet$1.prototype.indexOf = function ArraySet_indexOf(aStr) {
-	  if (hasNativeMap) {
-	    var idx = this._set.get(aStr);
-	    if (idx >= 0) {
-	        return idx;
-	    }
-	  } else {
-	    var sStr = util$2.toSetString(aStr);
-	    if (has$1.call(this._set, sStr)) {
-	      return this._set[sStr];
-	    }
+	  var idx = this._set.get(aStr);
+	  if (idx >= 0) {
+	      return idx;
 	  }
-
 	  throw new Error('"' + aStr + '" is not in the set.');
 	};
 
@@ -26517,10 +26522,6 @@
 			// Replace urls
 			this.replaceUrls(this.ast);
 
-			// Scope
-			this.id = UUID();
-			// this.addScope(this.ast, this.uuid);
-
 			// Replace IDs with data-id
 			this.replaceIds(this.ast);
 
@@ -26551,7 +26552,7 @@
 			csstree.walk(ast, {
 				visit: "Url",
 				enter: (node, item, list) => {
-					this.hooks.onUrl.trigger(node, item, list);
+					_assertSync(this.hooks.onUrl.trigger(node, item, list), "onUrl");
 				}
 			});
 		}
@@ -26563,17 +26564,17 @@
 					const basename = csstree.keyword(node.name).basename;
 
 					if (basename === "page") {
-						this.hooks.onAtPage.trigger(node, item, list);
+						_assertSync(this.hooks.onAtPage.trigger(node, item, list), "onAtPage");
 						this.declarations(node, item, list);
 					}
 
 					if (basename === "media") {
-						this.hooks.onAtMedia.trigger(node, item, list);
+						_assertSync(this.hooks.onAtMedia.trigger(node, item, list), "onAtMedia");
 						this.declarations(node, item, list);
 					}
 
 					if (basename === "import") {
-						this.hooks.onImport.trigger(node, item, list);
+						_assertSync(this.hooks.onImport.trigger(node, item, list), "onImport");
 						this.imports(node, item, list);
 					}
 				}
@@ -26586,7 +26587,7 @@
 				visit: "Rule",
 				enter: (ruleNode, ruleItem, rulelist) => {
 
-					this.hooks.onRule.trigger(ruleNode, ruleItem, rulelist);
+					_assertSync(this.hooks.onRule.trigger(ruleNode, ruleItem, rulelist), "onRule");
 					this.declarations(ruleNode, ruleItem, rulelist);
 					this.onSelector(ruleNode, ruleItem, rulelist);
 
@@ -26599,13 +26600,13 @@
 				visit: "Declaration",
 				enter: (declarationNode, dItem, dList) => {
 
-					this.hooks.onDeclaration.trigger(declarationNode, dItem, dList, {ruleNode, ruleItem, rulelist});
+					_assertSync(this.hooks.onDeclaration.trigger(declarationNode, dItem, dList, {ruleNode, ruleItem, rulelist}), "onDeclaration");
 
 					if (declarationNode.property === "content") {
 						csstree.walk(declarationNode, {
 							visit: "Function",
 							enter: (funcNode, fItem, fList) => {
-								this.hooks.onContent.trigger(funcNode, fItem, fList, {declarationNode, dItem, dList}, {ruleNode, ruleItem, rulelist});
+								_assertSync(this.hooks.onContent.trigger(funcNode, fItem, fList, {declarationNode, dItem, dList}, {ruleNode, ruleItem, rulelist}), "onContent");
 							}
 						});
 					}
@@ -26619,13 +26620,13 @@
 			csstree.walk(ruleNode, {
 				visit: "Selector",
 				enter: (selectNode, selectItem, selectList) => {
-					this.hooks.onSelector.trigger(selectNode, selectItem, selectList, {ruleNode, ruleItem, rulelist});
+					_assertSync(this.hooks.onSelector.trigger(selectNode, selectItem, selectList, {ruleNode, ruleItem, rulelist}), "onSelector");
 
 					if (selectNode.children.forEach(node => {if (node.type === "PseudoElementSelector") {
 						csstree.walk(node, {
 							visit: "PseudoElementSelector",
 							enter: (pseudoNode, pItem, pList) => {
-								this.hooks.onPseudoSelector.trigger(pseudoNode, pItem, pList, {selectNode, selectItem, selectList}, {ruleNode, ruleItem, rulelist});
+								_assertSync(this.hooks.onPseudoSelector.trigger(pseudoNode, pItem, pList, {selectNode, selectItem, selectList}, {ruleNode, ruleItem, rulelist}), "onPseudoSelector");
 							}
 						});
 					}}));
@@ -27637,13 +27638,41 @@
 			this.polisher = polisher;
 			this.caller = caller;
 
+			// [PATCH: handler-self-disable] Track each (hook, bound) pair we
+			// register so handlers that find nothing to do for a given render
+			// can splice themselves back out. Footnotes uses this to disappear
+			// when the document and CSS produced no footnote-marked nodes;
+			// combined with Hook.trigger/triggerSync's empty-handlers fast
+			// path, the per-page and per-node dispatches then short-circuit.
+			this._registered = {};
+
 			for (let name in hooks) {
 				if (name in this) {
 					let hook = hooks[name];
-					hook.register(this[name].bind(this));
+					let bound = this[name].bind(this);
+					this._registered[name] = { hook, bound };
+					hook.register(bound);
 				}
 			}
 		}
+
+		/**
+		 * Remove this handler's registered callbacks from every hook it
+		 * subscribed to, splicing each one out of `hook.hooks`.
+		 * @param {string} [except] name of the hook the caller is currently
+		 *   inside (e.g. `"afterParsed"`); its entry is kept, because splicing
+		 *   the array `trigger()` is iterating would skip a sibling handler.
+		 *   Harmless on one-shot hooks; call again later for recurring ones.
+		 */
+		_unregisterAll(except) {
+			for (const name in this._registered) {
+				if (name === except) continue;
+				const { hook, bound } = this._registered[name];
+				const idx = hook.hooks.indexOf(bound);
+				if (idx >= 0) hook.hooks.splice(idx, 1);
+				delete this._registered[name];
+			}
+		}
 	}
 
 	EventEmitter(Handler.prototype);
@@ -31281,6 +31310,18 @@
 
 		afterParsed(parsed) {
 			this.processFootnotes(parsed, this.footnotes);
+
+			// [PATCH: footnotes-self-disable] If neither source HTML nor CSS
+			// `float: footnote` rules produced any footnote-marked nodes, the
+			// remaining hooks (renderNode per element-node, afterPageLayout +
+			// beforePageLayout + afterOverflowRemoved per page) have nothing
+			// to do for the rest of this render. Unregister them so the
+			// empty-handlers fast-path in Hook.triggerSync short-circuits.
+			// afterParsed itself is skipped via `except` -- it's a one-shot
+			// and the surrounding trigger() loop is still iterating it.
+			if (!parsed.querySelector("[data-note='footnote']")) {
+				this._unregisterAll("afterParsed");
+			}
 		}
 
 		processFootnotes(parsed, notes) {
@@ -31336,7 +31377,12 @@
 
 				if (node.dataset.note === "footnote") {
 					notes = [node];
-				} else if (node.dataset.hasNotes || node.querySelectorAll("[data-note='footnote']")) {
+				} else if (node.dataset.hasNotes) {
+					// Upstream wrote `|| node.querySelectorAll(...)` here, but a
+					// NodeList is always truthy (even empty), so this branch was
+					// always taken: nodes without data-has-notes ran
+					// querySelectorAll in the condition and again on the next
+					// line -- two subtree scans per element-node clone.
 					notes = node.querySelectorAll("[data-note='footnote']");
 				}
 
@@ -31642,8 +31688,14 @@
 			let notes = area.querySelectorAll(".pagedjs_footnote_area [data-note='footnote']");
 			for (let n = 0; n < notes.length; n++) {
 				const note = notes[n];
-				// Check if the call for that footnote has been removed with the overflow
-				let call = removed.querySelector(`[data-footnote-call="${note.dataset.ref}"]`);
+				// [PATCH: extract-vs-delete] Guard `removed` access -- when
+				// removeOverflow took the deleteContents fast path (no
+				// footnotes in the rendered area), `removed` is null. In
+				// that case there are no rendered footnotes for the loop
+				// to iterate either, so we never actually enter this body.
+				// The guard is for future content where the area DOES have
+				// rendered footnotes but removeOverflow's pre-check changes.
+				let call = removed && removed.querySelector(`[data-footnote-call="${note.dataset.ref}"]`);
 				if (call) {
 					note.remove();
 				}
@@ -32725,599 +32777,21 @@
 		UndisplayedFilter
 	];
 
-	var isImplemented$4 = function () {
-		var from = Array.from, arr, result;
-		if (typeof from !== "function") return false;
-		arr = ["raz", "dwa"];
-		result = from(arr);
-		return Boolean(result && result !== arr && result[1] === "dwa");
-	};
-
-	var isImplemented$3;
-	var hasRequiredIsImplemented$1;
-
-	function requireIsImplemented$1 () {
-		if (hasRequiredIsImplemented$1) return isImplemented$3;
-		hasRequiredIsImplemented$1 = 1;
-
-		isImplemented$3 = function () {
-			if (typeof globalThis !== "object") return false;
-			if (!globalThis) return false;
-			return globalThis.Array === Array;
-		};
-		return isImplemented$3;
-	}
-
-	var implementation;
-	var hasRequiredImplementation;
-
-	function requireImplementation () {
-		if (hasRequiredImplementation) return implementation;
-		hasRequiredImplementation = 1;
-		var naiveFallback = function () {
-			if (typeof self === "object" && self) return self;
-			if (typeof window === "object" && window) return window;
-			throw new Error("Unable to resolve global `this`");
-		};
-
-		implementation = (function () {
-			if (this) return this;
-
-			// Unexpected strict mode (may happen if e.g. bundled into ESM module)
-
-			// Thanks @mathiasbynens -> https://mathiasbynens.be/notes/globalthis
-			// In all ES5+ engines global object inherits from Object.prototype
-			// (if you approached one that doesn't please report)
-			try {
-				Object.defineProperty(Object.prototype, "__global__", {
-					get: function () { return this; },
-					configurable: true
-				});
-			} catch (error) {
-				// Unfortunate case of Object.prototype being sealed (via preventExtensions, seal or freeze)
-				return naiveFallback();
-			}
-			try {
-				// Safari case (window.__global__ is resolved with global context, but __global__ does not)
-				if (!__global__) return naiveFallback();
-				return __global__;
-			} finally {
-				delete Object.prototype.__global__;
-			}
-		})();
-		return implementation;
-	}
-
-	var globalThis_1;
-	var hasRequiredGlobalThis;
-
-	function requireGlobalThis () {
-		if (hasRequiredGlobalThis) return globalThis_1;
-		hasRequiredGlobalThis = 1;
-
-		globalThis_1 = requireIsImplemented$1()() ? globalThis : requireImplementation();
-		return globalThis_1;
-	}
-
-	var isImplemented$2;
-	var hasRequiredIsImplemented;
-
-	function requireIsImplemented () {
-		if (hasRequiredIsImplemented) return isImplemented$2;
-		hasRequiredIsImplemented = 1;
-
-		var global     = requireGlobalThis()
-		  , validTypes = { object: true, symbol: true };
-
-		isImplemented$2 = function () {
-			var Symbol = global.Symbol;
-			var symbol;
-			if (typeof Symbol !== "function") return false;
-			symbol = Symbol("test symbol");
-			try { String(symbol); }
-			catch (e) { return false; }
-
-			// Return 'true' also for polyfills
-			if (!validTypes[typeof Symbol.iterator]) return false;
-			if (!validTypes[typeof Symbol.toPrimitive]) return false;
-			if (!validTypes[typeof Symbol.toStringTag]) return false;
-
-			return true;
-		};
-		return isImplemented$2;
-	}
-
-	var isSymbol;
-	var hasRequiredIsSymbol;
-
-	function requireIsSymbol () {
-		if (hasRequiredIsSymbol) return isSymbol;
-		hasRequiredIsSymbol = 1;
-
-		isSymbol = function (value) {
-			if (!value) return false;
-			if (typeof value === "symbol") return true;
-			if (!value.constructor) return false;
-			if (value.constructor.name !== "Symbol") return false;
-			return value[value.constructor.toStringTag] === "Symbol";
-		};
-		return isSymbol;
-	}
-
-	var validateSymbol;
-	var hasRequiredValidateSymbol;
-
-	function requireValidateSymbol () {
-		if (hasRequiredValidateSymbol) return validateSymbol;
-		hasRequiredValidateSymbol = 1;
-
-		var isSymbol = requireIsSymbol();
-
-		validateSymbol = function (value) {
-			if (!isSymbol(value)) throw new TypeError(value + " is not a symbol");
-			return value;
-		};
-		return validateSymbol;
-	}
-
-	var generateName;
-	var hasRequiredGenerateName;
-
-	function requireGenerateName () {
-		if (hasRequiredGenerateName) return generateName;
-		hasRequiredGenerateName = 1;
-
-		var d = dExports;
-
-		var create = Object.create, defineProperty = Object.defineProperty, objPrototype = Object.prototype;
-
-		var created = create(null);
-		generateName = function (desc) {
-			var postfix = 0, name, ie11BugWorkaround;
-			while (created[desc + (postfix || "")]) ++postfix;
-			desc += postfix || "";
-			created[desc] = true;
-			name = "@@" + desc;
-			defineProperty(
-				objPrototype,
-				name,
-				d.gs(null, function (value) {
-					// For IE11 issue see:
-					// https://connect.microsoft.com/IE/feedbackdetail/view/1928508/
-					//    ie11-broken-getters-on-dom-objects
-					// https://github.com/medikoo/es6-symbol/issues/12
-					if (ie11BugWorkaround) return;
-					ie11BugWorkaround = true;
-					defineProperty(this, name, d(value));
-					ie11BugWorkaround = false;
-				})
-			);
-			return name;
-		};
-		return generateName;
-	}
-
-	var standardSymbols;
-	var hasRequiredStandardSymbols;
-
-	function requireStandardSymbols () {
-		if (hasRequiredStandardSymbols) return standardSymbols;
-		hasRequiredStandardSymbols = 1;
-
-		var d            = dExports
-		  , NativeSymbol = requireGlobalThis().Symbol;
-
-		standardSymbols = function (SymbolPolyfill) {
-			return Object.defineProperties(SymbolPolyfill, {
-				// To ensure proper interoperability with other native functions (e.g. Array.from)
-				// fallback to eventual native implementation of given symbol
-				hasInstance: d(
-					"", (NativeSymbol && NativeSymbol.hasInstance) || SymbolPolyfill("hasInstance")
-				),
-				isConcatSpreadable: d(
-					"",
-					(NativeSymbol && NativeSymbol.isConcatSpreadable) ||
-						SymbolPolyfill("isConcatSpreadable")
-				),
-				iterator: d("", (NativeSymbol && NativeSymbol.iterator) || SymbolPolyfill("iterator")),
-				match: d("", (NativeSymbol && NativeSymbol.match) || SymbolPolyfill("match")),
-				replace: d("", (NativeSymbol && NativeSymbol.replace) || SymbolPolyfill("replace")),
-				search: d("", (NativeSymbol && NativeSymbol.search) || SymbolPolyfill("search")),
-				species: d("", (NativeSymbol && NativeSymbol.species) || SymbolPolyfill("species")),
-				split: d("", (NativeSymbol && NativeSymbol.split) || SymbolPolyfill("split")),
-				toPrimitive: d(
-					"", (NativeSymbol && NativeSymbol.toPrimitive) || SymbolPolyfill("toPrimitive")
-				),
-				toStringTag: d(
-					"", (NativeSymbol && NativeSymbol.toStringTag) || SymbolPolyfill("toStringTag")
-				),
-				unscopables: d(
-					"", (NativeSymbol && NativeSymbol.unscopables) || SymbolPolyfill("unscopables")
-				)
-			});
-		};
-		return standardSymbols;
-	}
-
-	var symbolRegistry;
-	var hasRequiredSymbolRegistry;
-
-	function requireSymbolRegistry () {
-		if (hasRequiredSymbolRegistry) return symbolRegistry;
-		hasRequiredSymbolRegistry = 1;
-
-		var d              = dExports
-		  , validateSymbol = requireValidateSymbol();
-
-		var registry = Object.create(null);
-
-		symbolRegistry = function (SymbolPolyfill) {
-			return Object.defineProperties(SymbolPolyfill, {
-				for: d(function (key) {
-					if (registry[key]) return registry[key];
-					return (registry[key] = SymbolPolyfill(String(key)));
-				}),
-				keyFor: d(function (symbol) {
-					var key;
-					validateSymbol(symbol);
-					for (key in registry) {
-						if (registry[key] === symbol) return key;
-					}
-					return undefined;
-				})
-			});
-		};
-		return symbolRegistry;
-	}
-
-	var polyfill;
-	var hasRequiredPolyfill;
-
-	function requirePolyfill () {
-		if (hasRequiredPolyfill) return polyfill;
-		hasRequiredPolyfill = 1;
-
-		var d                    = dExports
-		  , validateSymbol       = requireValidateSymbol()
-		  , NativeSymbol         = requireGlobalThis().Symbol
-		  , generateName         = requireGenerateName()
-		  , setupStandardSymbols = requireStandardSymbols()
-		  , setupSymbolRegistry  = requireSymbolRegistry();
-
-		var create = Object.create
-		  , defineProperties = Object.defineProperties
-		  , defineProperty = Object.defineProperty;
-
-		var SymbolPolyfill, HiddenSymbol, isNativeSafe;
-
-		if (typeof NativeSymbol === "function") {
-			try {
-				String(NativeSymbol());
-				isNativeSafe = true;
-			} catch (ignore) {}
-		} else {
-			NativeSymbol = null;
-		}
-
-		// Internal constructor (not one exposed) for creating Symbol instances.
-		// This one is used to ensure that `someSymbol instanceof Symbol` always return false
-		HiddenSymbol = function Symbol(description) {
-			if (this instanceof HiddenSymbol) throw new TypeError("Symbol is not a constructor");
-			return SymbolPolyfill(description);
-		};
-
-		// Exposed `Symbol` constructor
-		// (returns instances of HiddenSymbol)
-		polyfill = SymbolPolyfill = function Symbol(description) {
-			var symbol;
-			if (this instanceof Symbol) throw new TypeError("Symbol is not a constructor");
-			if (isNativeSafe) return NativeSymbol(description);
-			symbol = create(HiddenSymbol.prototype);
-			description = description === undefined ? "" : String(description);
-			return defineProperties(symbol, {
-				__description__: d("", description),
-				__name__: d("", generateName(description))
-			});
-		};
-
-		setupStandardSymbols(SymbolPolyfill);
-		setupSymbolRegistry(SymbolPolyfill);
-
-		// Internal tweaks for real symbol producer
-		defineProperties(HiddenSymbol.prototype, {
-			constructor: d(SymbolPolyfill),
-			toString: d("", function () { return this.__name__; })
-		});
-
-		// Proper implementation of methods exposed on Symbol.prototype
-		// They won't be accessible on produced symbol instances as they derive from HiddenSymbol.prototype
-		defineProperties(SymbolPolyfill.prototype, {
-			toString: d(function () { return "Symbol (" + validateSymbol(this).__description__ + ")"; }),
-			valueOf: d(function () { return validateSymbol(this); })
-		});
-		defineProperty(
-			SymbolPolyfill.prototype,
-			SymbolPolyfill.toPrimitive,
-			d("", function () {
-				var symbol = validateSymbol(this);
-				if (typeof symbol === "symbol") return symbol;
-				return symbol.toString();
-			})
-		);
-		defineProperty(SymbolPolyfill.prototype, SymbolPolyfill.toStringTag, d("c", "Symbol"));
-
-		// Proper implementaton of toPrimitive and toStringTag for returned symbol instances
-		defineProperty(
-			HiddenSymbol.prototype, SymbolPolyfill.toStringTag,
-			d("c", SymbolPolyfill.prototype[SymbolPolyfill.toStringTag])
-		);
-
-		// Note: It's important to define `toPrimitive` as last one, as some implementations
-		// implement `toPrimitive` natively without implementing `toStringTag` (or other specified symbols)
-		// And that may invoke error in definition flow:
-		// See: https://github.com/medikoo/es6-symbol/issues/13#issuecomment-164146149
-		defineProperty(
-			HiddenSymbol.prototype, SymbolPolyfill.toPrimitive,
-			d("c", SymbolPolyfill.prototype[SymbolPolyfill.toPrimitive])
-		);
-		return polyfill;
-	}
-
-	var es6Symbol;
-	var hasRequiredEs6Symbol;
-
-	function requireEs6Symbol () {
-		if (hasRequiredEs6Symbol) return es6Symbol;
-		hasRequiredEs6Symbol = 1;
-
-		es6Symbol = requireIsImplemented()()
-			? requireGlobalThis().Symbol
-			: requirePolyfill();
-		return es6Symbol;
-	}
-
-	var isArguments;
-	var hasRequiredIsArguments;
-
-	function requireIsArguments () {
-		if (hasRequiredIsArguments) return isArguments;
-		hasRequiredIsArguments = 1;
-
-		var objToString = Object.prototype.toString
-		  , id = objToString.call((function () { return arguments; })());
-
-		isArguments = function (value) { return objToString.call(value) === id; };
-		return isArguments;
-	}
-
-	var isFunction;
-	var hasRequiredIsFunction;
-
-	function requireIsFunction () {
-		if (hasRequiredIsFunction) return isFunction;
-		hasRequiredIsFunction = 1;
-
-		var objToString = Object.prototype.toString
-		  , isFunctionStringTag = RegExp.prototype.test.bind(/^[object [A-Za-z0-9]*Function]$/);
-
-		isFunction = function (value) {
-			return typeof value === "function" && isFunctionStringTag(objToString.call(value));
-		};
-		return isFunction;
-	}
-
-	var isImplemented$1 = function () {
-		var sign = Math.sign;
-		if (typeof sign !== "function") return false;
-		return sign(10) === 1 && sign(-20) === -1;
-	};
-
-	var shim$2;
-	var hasRequiredShim$2;
-
-	function requireShim$2 () {
-		if (hasRequiredShim$2) return shim$2;
-		hasRequiredShim$2 = 1;
-
-		shim$2 = function (value) {
-			value = Number(value);
-			if (isNaN(value) || value === 0) return value;
-			return value > 0 ? 1 : -1;
-		};
-		return shim$2;
-	}
-
-	var sign$1 = isImplemented$1() ? Math.sign : requireShim$2();
-
-	var sign  = sign$1
-	  , abs$1   = Math.abs
+	var abs$1   = Math.abs
 	  , floor$1 = Math.floor;
 
-	var toInteger$1 = function (value) {
+	var toInteger = function (value) {
 		if (isNaN(value)) return 0;
 		value = Number(value);
 		if (value === 0 || !isFinite(value)) return value;
-		return sign(value) * floor$1(abs$1(value));
+		return Math.sign(value) * floor$1(abs$1(value));
 	};
 
-	var toInteger = toInteger$1
-	  , max       = Math.max;
+	var max = Math.max;
 
 	var toPosInteger = function (value) { return max(0, toInteger(value)); };
 
-	var isString;
-	var hasRequiredIsString;
-
-	function requireIsString () {
-		if (hasRequiredIsString) return isString;
-		hasRequiredIsString = 1;
-
-		var objToString = Object.prototype.toString, id = objToString.call("");
-
-		isString = function (value) {
-			return (
-				typeof value === "string" ||
-				(value &&
-					typeof value === "object" &&
-					(value instanceof String || objToString.call(value) === id)) ||
-				false
-			);
-		};
-		return isString;
-	}
-
-	var shim$1;
-	var hasRequiredShim$1;
-
-	function requireShim$1 () {
-		if (hasRequiredShim$1) return shim$1;
-		hasRequiredShim$1 = 1;
-
-		var iteratorSymbol = requireEs6Symbol().iterator
-		  , isArguments    = requireIsArguments()
-		  , isFunction     = requireIsFunction()
-		  , toPosInt       = toPosInteger
-		  , callable       = validCallable
-		  , validValue$1     = validValue
-		  , isValue        = isValue$4
-		  , isString       = requireIsString()
-		  , isArray        = Array.isArray
-		  , call           = Function.prototype.call
-		  , desc           = { configurable: true, enumerable: true, writable: true, value: null }
-		  , defineProperty = Object.defineProperty;
-
-		// eslint-disable-next-line complexity, max-lines-per-function
-		shim$1 = function (arrayLike /*, mapFn, thisArg*/) {
-			var mapFn = arguments[1]
-			  , thisArg = arguments[2]
-			  , Context
-			  , i
-			  , j
-			  , arr
-			  , length
-			  , code
-			  , iterator
-			  , result
-			  , getIterator
-			  , value;
-
-			arrayLike = Object(validValue$1(arrayLike));
-
-			if (isValue(mapFn)) callable(mapFn);
-			if (!this || this === Array || !isFunction(this)) {
-				// Result: Plain array
-				if (!mapFn) {
-					if (isArguments(arrayLike)) {
-						// Source: Arguments
-						length = arrayLike.length;
-						if (length !== 1) return Array.apply(null, arrayLike);
-						arr = new Array(1);
-						arr[0] = arrayLike[0];
-						return arr;
-					}
-					if (isArray(arrayLike)) {
-						// Source: Array
-						arr = new Array((length = arrayLike.length));
-						for (i = 0; i < length; ++i) arr[i] = arrayLike[i];
-						return arr;
-					}
-				}
-				arr = [];
-			} else {
-				// Result: Non plain array
-				Context = this;
-			}
-
-			if (!isArray(arrayLike)) {
-				if ((getIterator = arrayLike[iteratorSymbol]) !== undefined) {
-					// Source: Iterator
-					iterator = callable(getIterator).call(arrayLike);
-					if (Context) arr = new Context();
-					result = iterator.next();
-					i = 0;
-					while (!result.done) {
-						value = mapFn ? call.call(mapFn, thisArg, result.value, i) : result.value;
-						if (Context) {
-							desc.value = value;
-							defineProperty(arr, i, desc);
-						} else {
-							arr[i] = value;
-						}
-						result = iterator.next();
-						++i;
-					}
-					length = i;
-				} else if (isString(arrayLike)) {
-					// Source: String
-					length = arrayLike.length;
-					if (Context) arr = new Context();
-					for (i = 0, j = 0; i < length; ++i) {
-						value = arrayLike[i];
-						if (i + 1 < length) {
-							code = value.charCodeAt(0);
-							// eslint-disable-next-line max-depth
-							if (code >= 0xd800 && code <= 0xdbff) value += arrayLike[++i];
-						}
-						value = mapFn ? call.call(mapFn, thisArg, value, j) : value;
-						if (Context) {
-							desc.value = value;
-							defineProperty(arr, j, desc);
-						} else {
-							arr[j] = value;
-						}
-						++j;
-					}
-					length = j;
-				}
-			}
-			if (length === undefined) {
-				// Source: array or array-like
-				length = toPosInt(arrayLike.length);
-				if (Context) arr = new Context(length);
-				for (i = 0; i < length; ++i) {
-					value = mapFn ? call.call(mapFn, thisArg, arrayLike[i], i) : arrayLike[i];
-					if (Context) {
-						desc.value = value;
-						defineProperty(arr, i, desc);
-					} else {
-						arr[i] = value;
-					}
-				}
-			}
-			if (Context) {
-				desc.value = null;
-				arr.length = length;
-			}
-			return arr;
-		};
-		return shim$1;
-	}
-
-	var from = isImplemented$4() ? Array.from : requireShim$1();
-
-	var isImplemented = function () {
-		var numberIsNaN = Number.isNaN;
-		if (typeof numberIsNaN !== "function") return false;
-		return !numberIsNaN({}) && numberIsNaN(NaN) && !numberIsNaN(34);
-	};
-
-	var shim;
-	var hasRequiredShim;
-
-	function requireShim () {
-		if (hasRequiredShim) return shim;
-		hasRequiredShim = 1;
-
-		shim = function (value) {
-			// eslint-disable-next-line no-self-compare
-			return value !== value;
-		};
-		return shim;
-	}
-
-	var isNan = isImplemented() ? Number.isNaN : requireShim();
-
-	var numberIsNaN       = isNan
+	var numberIsNaN       = Number.isNaN
 	  , toPosInt          = toPosInteger
 	  , value$1             = validValue
 	  , indexOf$1           = Array.prototype.indexOf
@@ -33349,7 +32823,7 @@
 	  , splice  = Array.prototype.splice;
 
 	// eslint-disable-next-line no-unused-vars
-	var remove$1 = function (itemToRemove /*, …item*/) {
+	var remove = function (itemToRemove /*, …item*/) {
 		forEach.call(
 			arguments,
 			function (item) {
@@ -33364,17 +32838,14 @@
 
 	var map = { function: true, object: true };
 
-	var isObject$1 = function (value) { return (isValue(value) && map[typeof value]) || false; };
-
-	var isObject = isObject$1;
+	var isObject = function (value) { return (isValue(value) && map[typeof value]) || false; };
 
 	var validObject = function (value) {
 		if (!isObject(value)) throw new TypeError(value + " is not an Object");
 		return value;
 	};
 
-	var aFrom          = from
-	  , remove         = remove$1
+	var aFrom          = Array.from
 	  , value          = validObject
 	  , d              = dExports
 	  , emit           = eventEmitterExports.methods.emit
@@ -33522,16 +32993,39 @@
 			let template;
 			template = body.querySelector(":scope > template[data-ref='pagedjs-content']");
 
-			if (!template) {
-				// Otherwise create one
-				template = document.createElement("template");
-				template.dataset.ref = "pagedjs-content";
-				template.innerHTML = body.innerHTML;
-				body.innerHTML = "";
-				body.appendChild(template);
+			if (template) {
+				// [PATCH: wrap-content-move] Re-entrant call: the fragment we
+				// returned previously was stashed on the marker template's
+				// `_pagedjsContent` expando (template.content stays empty under
+				// the move strategy below).
+				return template._pagedjsContent || template.content;
 			}
 
-			return template.content;
+			// [PATCH: wrap-content-move] Move children into a plain
+			// DocumentFragment owned by the live document instead of round-
+			// tripping through innerHTML (serialise the entire body to a
+			// string, reparse into a template). The round-trip is O(document
+			// size) twice over; the move is one O(n) detach/attach pass with
+			// no string work.
+			//
+			// Why a plain DocumentFragment and not template.content: a
+			// template's content fragment is owned by the inert "template
+			// contents owner document", and moving live <img> elements into
+			// it triggers adoptNode which runs the spec's "update the image
+			// data" algorithm. That resets .complete and leaves the source
+			// image in a state where later cloning into the live page wrapper
+			// doesn't synchronously cache-hit -- our sync waitForImages check
+			// then throws. A plain fragment stays in the live document so
+			// adoption is a no-op and image state is preserved.
+			let fragment = document.createDocumentFragment();
+			while (body.firstChild) {
+				fragment.appendChild(body.firstChild);
+			}
+			template = document.createElement("template");
+			template.dataset.ref = "pagedjs-content";
+			template._pagedjsContent = fragment;
+			body.appendChild(template);
+			return fragment;
 		}
 
 		removeStyles(doc=document) {
diff --git a/perf/README.md b/perf/README.md
index 135e873d..135d2de2 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -12,6 +12,44 @@ page count roughly quadruples the total render time.
 
 This folder holds the tools used to investigate that.
 
+## Profiling `paged.browser.js`: canonical command
+
+The command we reach for whenever CPU-profiling paged.js:
+
+```
+node measure.mjs --detach-pages --no-timing --render-only --cpu-profile --cpu-sampling 100
+```
+
+(`run.bat` forwards the same args.) Flag rationale:
+
+- `--detach-pages` -- inject the shipping fix. The profile reflects
+  what production actually pays, not the old O(n^2) baseline.
+- `--no-timing` -- skip the per-page `console.log` relay from
+  `timing-handler.js`. The relay costs ~2 % of render self-time on
+  the 1638-page book and muddies the bottom-up view.
+- `--render-only` -- bail out after `PagedPolyfill.preview()`
+  returns. Skips meta extraction, `parseOutline`, `page.pdf`, and
+  the pdf-lib roundtrip / incremental writer. ~47 s saved per run
+  on the book (~55 s full -> ~8 s render-only), with no effect on
+  what the `--cpu-profile` trace captures (it already covered only
+  the render phase).
+- `--cpu-profile` -- write `render.cpuprofile` (render phase only)
+  into the timestamped `results/` folder. Open in Chrome DevTools via
+  Performance -> "Load profile...", or interrogate from the terminal
+  with `analyze-profile.mjs` / `find-callers.mjs` / `find-callees.mjs`
+  / `grep-profile.mjs`.
+- `--cpu-sampling 100` -- 100 us sampling, 10x denser than the 1 ms
+  default. Resolves frames in paged.js's sub-millisecond inner loops
+  where most remaining cost lives (see "Looking past `finalizePage`"
+  and later sections). Larger profile file in return.
+
+Drop `--render-only` whenever you need to also measure generate /
+process (e.g. confirming a fix doesn't shift cost into `page.pdf()`
+or pdf-lib), or to write `book.pdf` for behavioural verification.
+
+The rest of this README is the long-form narrative -- baseline
+findings, each landed optimisation, and the residual hotspots.
+
 ## The plan
 
 The render pipeline has three phases, matching what `pagedjs-cli`
@@ -60,6 +98,7 @@ DevTools-compatible trace is a few lines.
 | `detach-pages.js` | `Paged.Handler` that hides each completed page from the layout tree (registered against `finalizePage`). The fix. Injected by `--detach-pages` and by `docs/book.bat`. |
 | `instrument-flush-ops.js` | Wraps `getComputedStyle`, `getBoundingClientRect`, and the `offsetWidth` / `clientWidth` / `scrollWidth` family with counters + per-call timing. Injected by `--instrument`. |
 | `time-hooks.js` | Wraps every task registered to `chunker.hooks.*` and `polisher.hooks.*` with a wall-clock timer. Tells you which handler's hook method is eating render time, per page. Injected by `--time-hooks`. |
+| `instrument-clones.js` | Wraps `Layout.prototype.append` to tag every source-walker clone, then walks each finalized page at `finalizePage` counting tagged survivors. Reports total appendCalls vs. survivors and the per-page overshoot distribution -- the share of clones rolled back by `removeOverflow`. Requires a one-line `window.PagedLayout = Layout` patch near the bottom of `docs/lib/paged.browser.js` (it's a private class otherwise). Injected by `--clone-count`. |
 | `incremental-pdf.mjs` | Replaces the pdf-lib load+save roundtrip with a PDF 1.7 §7.5.6 incremental update appended to Chrome's bytes. Used by `--incremental`. |
 | `test-incremental.mjs` | Smoke test for `incremental-pdf.mjs`: renders a tiny probe page, runs the writer, verifies the result parses (via pdf-lib re-load) and that outline + metadata land correctly. |
 | `profile-load.mjs` | Standalone profiler for `PDFDocument.load`. Runs the load on a chosen PDF with a chosen `parseSpeed`; intended to be run under `node --cpu-prof`. |
@@ -68,6 +107,9 @@ DevTools-compatible trace is a few lines.
 | `compare-outlines.mjs` | Diffs two PDFs' `/Outlines` trees by `(depth, title, target page)`. Used to verify whether Chrome's native outline matches the injected one. |
 | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. |
 | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. |
+| `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
+| `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
+| `grep-profile.mjs` | Lists every node in a `.cpuprofile` whose `functionName` matches a regex, with self-time and location. Quick check for "is this frame in the profile at all, and what's it called?" |
 | `run.bat` | Windows wrapper. Installs deps on first run, then invokes `node measure.mjs`. |
 | `results/` | Output, one timestamped subfolder per run. Git-ignored. |
 
@@ -116,6 +158,8 @@ run.bat path\to\some-other.html           # explicit input
 run.bat --out my-run                      # explicit output directory
 run.bat --detach-pages                    # inject the detach-pages fix
 run.bat --cpu-profile                     # CPU-profile the render phase
+run.bat --render-only                     # bail out after render (skip generate + process, ~47s saved)
+run.bat --clone-count                     # report Layout.append clones appended vs survivors per page
 run.bat --instrument                      # count + time DOM-accessor calls
 run.bat --time-hooks                      # per-task timing of every chunker/polisher hook
 run.bat --incremental                     # process via incremental update instead of pdf-lib roundtrip
@@ -2437,3 +2481,1641 @@ generate. After the puppeteer 25 bump it would save less than the
 earlier estimate (the 64 s -> 43 s gain made the target smaller),
 but it's still the only knob with a profile target large enough to
 move the wall-clock total by 5+ s.
+
+## Can we make `removeChild` cheaper?
+
+After the findRef fix, `removeChild` sits at ~12 % of render
+self-time. The detach-pages handler attribution is clean -- 1651
+detaches for 1651 pages, exactly one per page, with the only
+other removeChild callers being `filterTree` at startup (9,192
+ignorable-text-node strips totalling 2.3 ms; not a hot path).
+
+Per-call cost on the 1651-page book, with `Element.prototype.removeChild`
+wrapped to measure each call:
+
+```
+[instrument] page-detach avg:      1.009 ms/call
+[instrument] page-detach median:   0.900 ms/call
+[instrument] page-detach p90:      2.000 ms/call
+[instrument] page-detach p99:      3.000 ms/call
+[instrument] avg descendants/page: 147.7
+```
+
+That's ~5-7 us per descendant LayoutObject torn down, multiplied
+by ~150 descendants per page, multiplied by ~1651 pages = ~1.7 s
+total. The distribution is tight and scales linearly with
+descendant count -- this looks like ordinary Blink teardown work
+rather than a pathological slow path.
+
+To verify, two structural variants were both tested on the same
+instrumentation harness:
+
+### Variant B: graveyard DocumentFragment
+
+Replace `parent.removeChild(page)` with
+`graveyard.appendChild(page)`, where `graveyard` is a fresh
+`DocumentFragment` held by the handler. Hypothesis: the
+move-to-out-of-document-fragment path might skip some
+LayoutObject teardown work because the destination is itself
+disconnected.
+
+| metric | A (removeChild) | B (graveyard) |
+| ------ | --------------- | ------------- |
+| avg per call | **1.009 ms** | 1.082 ms (+7 %) |
+| median | 0.900 ms | 0.900 ms |
+| p90 | 2.000 ms | 2.200 ms |
+| p99 | 3.000 ms | 3.100 ms |
+| total page wall | 1666 ms | 1785 ms |
+| render wall-clock | ~16.1 s | ~15.2 s (run-to-run noise) |
+
+The graveyard move is **slightly slower** per call. Blink tears
+down the LayoutObjects regardless of where the node lands; there's
+no fast-path for "moved to a detached parent". No win.
+
+### Variant C: `contain: layout style` on `.pagedjs_page`
+
+Inject `<style>.pagedjs_page { contain: layout style; }</style>`
+into the document before render. Hypothesis: removing a contained
+subtree might skip style/layout invalidation propagation because
+Blink already knows the subtree didn't influence its siblings or
+parent.
+
+Also tested `contain: strict` (which adds `paint` and `size`
+containment -- pages already have explicit dimensions via @page
+CSS so this is safe).
+
+| metric | A (no contain) | C (layout style) | C-strict |
+| ------ | -------------- | ---------------- | -------- |
+| avg per call | **1.009 ms** | 1.017 ms | 0.991 ms |
+| median | 0.900 ms | 0.900 ms | 0.900 ms |
+| p90 | 2.000 ms | 1.900 ms | 1.900 ms |
+| total page wall | 1666 ms | 1678 ms | 1634 ms |
+| render wall-clock | ~16.1 s | ~15.0 s | ~14.8 s |
+
+All four runs are within ~5 % of each other on per-call cost --
+well inside the run-to-run noise band. Containment doesn't unlock
+a faster removeChild path either.
+
+### Conclusion (variants B + C)
+
+The 1.7 s of `removeChild` is intrinsic Blink LayoutObject
+teardown work. The math checks out at ~5-7 us per descendant ×
+~150 descendants × 1651 pages, and three different framings
+(plain removeChild, move-to-fragment, contain + removeChild) all
+land within ~10 % of each other. The destination of the move and
+the containment metadata don't change Blink's teardown rate.
+
+The one thing we *don't* do is "remove less per page" -- removing
+a page's content as N individual leaf removals would be strictly
+worse (N × overhead instead of 1 × overhead, same teardown total).
+Each removeChild call carries DOM-mutation, style-invalidation,
+and notify overhead beyond the per-descendant cost, so consolidating
+to one removal per page is already the optimal framing.
+
+### Variant D: don't detach at all, just `contain: strict`
+
+A natural follow-up: if the per-page cost of having siblings
+around really comes from style/selector traversal, maybe Blink
+will skip a *contained* sibling subtree even when it can't skip
+a `display: none` one. Containment is a stronger signal -- it
+explicitly tells the engine "no observable interaction crosses
+this boundary" -- so the renderer ought to be able to short-circuit
+sibling-walks more aggressively.
+
+Implementation: replace the detach handler with one that sets
+`pageElement.style.contain = 'strict'` at finalizePage and clears
+the property for every page at afterRendered (so `page.pdf()`
+serializes the right paint state).
+
+Result:
+
+| metric | current detach | variant D (contain:strict, no detach) |
+| ------ | -------------- | --------------------------------------- |
+| **render wall-clock** | **~16 s** | **89.3 s** |
+| `Page.create` gBCR | ~764 ms | **31,142 ms** |
+| `hasOverflow` gBCR | ~2,478 ms | 10,922 ms |
+| total gBCR | ~4,832 ms | 45,413 ms |
+| per-page ratio (last/first) | 1.36x | 4.11x |
+
+Worse than the README's display:none baseline (`Page.create`
+gBCR 12,947 ms / render 48.5 s). Containment metadata adds work
+to per-sibling evaluation rather than removing it. **Definitive
+no.** Containment is a hint about what's inside the box; it
+doesn't make the box invisible to neighbours.
+
+### Variant E: empty the wrapper, leave it in place
+
+A second framing of the same idea: keep the page wrapper as a
+sibling, but move its children to a stash so the wrapper itself
+is a leaf (no descendants for Blink to walk through). Restore
+the children at afterRendered. This isolates the "what costs
+what" question: does sibling-walk cost depend on descendant
+count, or just on sibling count?
+
+Implementation: at finalizePage, for the previous-finalized page
+(one behind, mirroring the keep-one-back pattern), move each
+child into an array via `wrapper.removeChild(wrapper.firstChild)`,
+set `min-height: 297mm` so the wrapper still occupies its slot,
+and stash the children. At afterRendered, restore.
+
+Result:
+
+| metric | current detach | variant E (empty wrapper) |
+| ------ | -------------- | --------------------------- |
+| **render wall-clock** | **~16 s** | **21.9 s** |
+| `Page.create` gBCR | ~764 ms | 2,628 ms (+1,864) |
+| `hasOverflow` gBCR | ~2,478 ms | 5,024 ms (+2,546) |
+| `Layout` gBCR | ~294 ms | 937 ms |
+| total gBCR | ~4,832 ms | **10,127 ms (+5,295)** |
+| `removeChild` self | 2,426 ms | **854 ms (-1,572)** |
+| per-page ratio (last/first) | 1.36x | 2.93x |
+
+The removeChild *savings* are real -- with no wrapper to tear
+down, just ~150 child removals per page at sub-microsecond each.
+But the gBCR *cost* roughly doubles because the wrappers are
+still siblings, and gBCR firings have to walk them. Net is +5 s
+render, *worse* than the current detach.
+
+This experiment yields a clean cost-model decomposition. Pulling
+the gBCR deltas apart against the wrapper-vs-content split:
+
+```
+display:none baseline (full content):       gBCR(Page.create) ≈ 12,947 ms
+variant E (empty wrappers, n=1651):         gBCR(Page.create) ≈  2,628 ms
+current detach (no siblings):               gBCR(Page.create) ≈    764 ms
+```
+
+Subtracting:
+
+- (variant E - current detach) = 1,864 ms for 1,651 sibling wrappers
+  → ~1.1 us per wrapper-sibling per `Page.create` gBCR call
+- (display:none - variant E) = 10,319 ms for 1,651 × 150 ≈
+  247,650 sibling descendants
+  → ~42 us per sibling-descendant per `Page.create` gBCR call
+
+Both wrappers and their descendants contribute to the per-call
+cost. Removing the descendants helps -- variant E really is
+substantially cheaper than display:none -- but the wrapper cost
+alone is enough to lose. To zero out both contributions you have
+to take both the wrapper and its descendants out of the sibling
+list, which is exactly what the current detach does.
+
+### Variant F: `content-visibility: hidden`, no detach
+
+The CSS spec's `content-visibility: hidden` is the closest
+property to "freeze in place without disposing" -- per spec,
+rendering work is "skipped" but cached state is preserved for
+cheap restoration. Conceptually nearer to a freeze than
+`display: none` or `contain: strict` were.
+
+Implementation: at finalizePage, set
+`pageElement.style.contentVisibility = 'hidden'` and
+`containIntrinsicSize = '210mm 297mm'` (the size hint Blink uses
+when content-visibility skips a subtree). At afterRendered,
+clear both.
+
+Result:
+
+| metric | current detach | variant F (cv:hidden) |
+| ------ | -------------- | ----------------------- |
+| **render wall-clock** | **~16 s** | **95.2 s** |
+| `Page.create` gBCR | ~764 ms | **29,656 ms** |
+| `hasOverflow` gBCR | ~2,478 ms | 17,558 ms |
+| total gBCR | ~4,832 ms | 52,899 ms |
+| per-page ratio (last/first) | 1.36x | 5.12x |
+
+Worse than every other variant. The spec's "skip rendering work"
+clause covers painting and composition; it does **not** make the
+subtree invisible to sibling-walks during style and selector
+matching that gBCR forces. Three "leave in place" properties
+(`display: none`, `contain: strict`, `content-visibility: hidden`)
+have now been tested and none of them short-circuit the
+sibling-walk.
+
+### Conclusion across all six variants
+
+| variant | render | net vs current |
+| ------- | ------ | -------------- |
+| A current (removeChild, no contain) | ~16.1 s | (baseline) |
+| B graveyard fragment | ~15.2 s | flat (noise) |
+| C `contain: layout style` + removeChild | ~15.0 s | flat (noise) |
+| C-strict `contain: strict` + removeChild | ~14.8 s | flat (noise) |
+| **D `contain: strict`, no detach** | **89.3 s** | **+73 s** |
+| **E empty wrappers, no detach** | **21.9 s** | **+5.9 s** |
+| **F `content-visibility: hidden`, no detach** | **95.2 s** | **+79 s** |
+
+The flat band (A/B/C/C-strict) is the cost-of-doing-business --
+~1 ms × 1651 pages = ~1.7 s of intrinsic Blink LayoutObject
+teardown. Variations on the framing don't move it. The
+catastrophic band (D, E, F) confirms that any path where the page
+wrapper stays in the live sibling list pays meaningfully more
+than the teardown cost would have been -- ~1.1 us per
+wrapper-sibling × 1651 wrappers × several gBCR call sites per
+page comes out to several seconds of extra render even when the
+wrapper is otherwise empty and contained.
+
+The 1.7 s is the bill we pay for shrinking the live DOM from
+~150 × 1651 ≈ 250k nodes back down to 2 nodes (in-flight page +
+keeper), which is what kept `Page.create`'s gBCR flat per page
+(see "Hypothesis 2: sibling sweeps over `display: none` pages"
+above). Net savings vs the display:none variant was ~22 s render;
+the 1.7 s removeChild cost is roughly 8 % of that win paid back
+to Blink for cleanup. Worth keeping.
+
+### Aside: it's not GC, and JS references don't help
+
+A reasonable follow-up question to all of this is "can we just
+hold a reference to the detached children to avoid disposal,
+or turn off GC to skip the cleanup?" Neither applies to what
+we're measuring.
+
+Chromium maintains two trees:
+
+- **DOM tree** -- `Node` objects, JS-visible, referenceable.
+- **Render tree** -- `LayoutObject` / `LayoutBox` / `LayoutText`
+  etc., Blink-internal, NOT JS-visible.
+
+`removeChild` keeps the DOM Node alive (JS reference holders --
+including the handler's `this._detached` array -- prevent
+collection). But the corresponding LayoutObject in the render
+tree is **destroyed immediately**, synchronously, at the
+removeChild call. Re-attaching via appendChild later builds a
+new LayoutObject from scratch.
+
+There is no JS-level API to keep a LayoutObject alive across
+detach + reattach. Holding DOM references doesn't change the
+render-tree lifecycle. The 1.7 s lives entirely in
+LayoutObject teardown -- which is Blink-internal C++ work
+attributed to the `removeChild` native frame in the profile,
+not to GC.
+
+V8's GC is a separate concern and isn't the bottleneck. The
+profile reads:
+
+```
+   self_ms   self_%   function
+    195.21    0.89%   (garbage collector)
+```
+
+~200 ms over a ~22 s render. Even if it could be disabled
+(it can't -- Node would OOM), it would barely register.
+
+The asymmetry between variants B and E makes this concrete.
+Variant B (graveyard fragment) moves the page from
+`.pagedjs_pages` to a detached DocumentFragment; variant E
+(empty wrapper) keeps the page in `.pagedjs_pages` but moves
+its children out. The fragment-move path *does* trigger
+LayoutObject teardown (you can see the 1.08 ms / call in
+variant B's instrumentation) even though the DOM Node lives on
+in a JS-visible fragment -- because the destination is itself
+not attached to the document, so there's no live render-tree
+parent. Conversely, variant E's wrapper stays in
+`.pagedjs_pages` with a live LayoutObject the whole time, so
+the wrapper's render-tree slot doesn't get torn down; only
+its child LayoutObjects do (as the children move out). The
+"keep render objects alive" idea would have to mean keeping
+the wrapper in `.pagedjs_pages` with all its children, which
+is the display:none baseline -- ~48 s render.
+
+The trade-off is therefore not "keep things alive vs. let GC
+collect them"; it's "be a live render-tree sibling vs. not".
+Anything that keeps the wrapper as a live sibling pays the
+~1.1 us per wrapper-sibling per gBCR call shown above, and the
+gBCR firings compound that into seconds across 1651 pages.
+
+## Chasing the residual `(idle)` to requestAnimationFrame
+
+A second axis of the same investigation. The post-findRef-fix
+profile showed `(idle) 735 ms (4.6 %)` -- not huge, but non-zero
+and worth understanding. `(idle)` in a V8 CPU profile means
+samples taken while the main thread had nothing scheduled --
+waiting on async/await, microtask queue settling, requestAnimationFrame
+ticks, or other browser-internal yields.
+
+### Hypothesis 1: microtask boundaries from `await Hook.trigger(...)`
+
+The chunker's per-page loop has 5-6 `await this.hooks.X.trigger(...)`
+calls per page. `Hook.trigger()` wraps every sync handler in a fresh
+Promise and returns `Promise.all(promises)`, so the caller always
+awaits a thenable -- a microtask boundary per await even when every
+handler resolved synchronously. 5 boundaries × 1651 pages ≈ 8,255
+yields; if each yield is ~85 us in V8 it lines up with the 735 ms.
+
+Patched it: `Hook.trigger()` returns `undefined` when no handler
+returned a thenable, callers do
+`let p = hook.trigger(...); if (p) await p;` to skip the await on
+the sync fast path. Patched at six hot per-page sites (3 in
+`chunker.layout`, 3 in `chunker.handleBreaks`).
+
+Result: render went **up** by ~0.35 s on a 2-run paired A/B
+(14.57 s -> 14.92 s avg). `(idle)` in the profile went **up too**
+(735 ms -> 1223 ms in absolute terms). Microtask boundaries are
+~30 us each at the JIT level; the V8 sampler at 1 ms intervals
+hardly catches them, so they show up as `(program)` rather than
+`(idle)`. The patch shaved microtask scheduling cost in the
+single-digit percent range but added a branch on every Hook.trigger
+call -- net wash, slight regression. **Reverted.**
+
+### Hypothesis 2: ResizeObserver firing per page
+
+Per page, `Page.addResizeObserver` creates a fresh `ResizeObserver`
+that fires its callback asynchronously from the compositor thread
+back to main. The callback wraps work in `requestAnimationFrame`,
+so each RO firing schedules a frame-tick wait. 1651 pages × ~0.5 ms
+per RO-rAF round-trip ≈ ~800 ms. Plausible.
+
+Two-step probe:
+1. **Skip the rAF wrap inside the RO callback**, run synchronously.
+   Result: `(idle) 902 ms`. No improvement, possibly slightly worse.
+2. **Disable the ResizeObserver entirely** (early-return in
+   `addResizeObserver`). Result: `(idle) 1,074 ms`. Still no
+   improvement.
+
+Neither helped. The RO isn't the source -- the per-page
+`addResizeObserver` overhead is real, but it doesn't show up in
+the `(idle)` bucket. Restored upstream behaviour.
+
+### Hypothesis 3: the chunker's `Queue.tick` is `requestAnimationFrame`
+
+The chunker drives its per-page work through a `Queue` class
+(`paged.browser.js:2666`). The queue's constructor sets:
+
+```js
+this.tick = requestAnimationFrame;
+```
+
+and `Queue.run()` schedules each iteration via
+`this.tick.call(window, () => { ... });`. Chunker's `render()`
+loops over `this.q.enqueue(() => this.renderAsync(renderer))`
+once per page. Every per-page iteration therefore waits one rAF
+tick before processing.
+
+`requestAnimationFrame` waits for the next animation frame. In
+headless puppeteer with no display, rAF still delivers callbacks
+on a regular cadence (Chromium's headless mode default is around
+60 Hz off-screen / ~16 ms per frame, with the scheduler often
+batching tighter than that). Either way, per-page rAF waits
+across 1651 pages add up to several hundred milliseconds of pure
+main-thread idle.
+
+The fix is one line:
+
+```js
+this.tick = (cb) => queueMicrotask(cb);
+```
+
+`queueMicrotask` schedules the callback on the microtask queue --
+runs before returning to the event loop, microsecond-scale latency
+instead of millisecond-scale. The `Queue` doesn't depend on rAF
+semantics (no paint coordination, no frame-budget yielding --
+it's just a serializer that wants to run tasks back-to-back).
+
+Verification (paired 2-run A/B, `--detach-pages`, no
+instrumentation, no cpu-profile):
+
+| run | BEFORE render | AFTER render |
+| --- | --- | --- |
+| 1 | 14.62 s | 11.86 s |
+| 2 | 14.51 s | 12.12 s |
+| **avg** | **14.57 s** | **11.99 s** |
+
+**Δ = -2.58 s render (-18 %).** Larger than the 735 ms `(idle)`
+that prompted the look -- because rAF was costing real (program)
+work too (V8 scheduler, microtask queue draining around the rAF
+boundary), not just idle wait. CPU profile of the fixed render:
+
+```
+   self_ms   self_%   function
+   -------   ------   ----------------------------------------------
+   4355.74   34.75%   getBoundingClientRect
+   1935.89   15.45%   removeChild
+   1934.11   15.43%   (program)             (was 5872 -- down ~4 s)
+    636.43    5.08%   removeOverflow
+    -- (idle) absent from the top 10, < 130 ms (1 %)
+```
+
+`(idle)` dropped out of the top 10 (< 130 ms / 1 %), `(program)`
+dropped from 5872 ms to 1934 ms (-4 s), `removeChild` dropped
+slightly (2426 ms -> 1935 ms; smaller render = same per-call cost
+× same call count, so this is sampling artefact, not a real
+change). PDF byte size unchanged (within standard timestamp
+drift). Shipped.
+
+### What the three hypotheses together teach
+
+`(idle)` in a V8 CPU profile attribution table is **not** primarily
+microtask scheduling -- those are too fast to sample. It's
+genuinely-waiting time, where the main thread had no V8 work to do.
+The dominant source of waiting in our render was not async/await,
+not ResizeObserver coalescing, but a `requestAnimationFrame`
+buried in the chunker's task queue. Replacing it with
+`queueMicrotask` collapses the per-page wait, and additionally
+shrinks the surrounding V8 scheduler work because each rAF
+callback came with its own setup / teardown overhead.
+
+The pattern to remember: if a profile shows non-trivial `(idle)`
+in a render-style workload, hunt for explicit `requestAnimationFrame`
+/ `setTimeout` / `requestIdleCallback` calls in the hot path before
+investigating microtask machinery. The frame-paced scheduler is a
+much bigger lever than the microtask scheduler.
+
+### Follow-up: the `Queue` itself was unnecessary indirection
+
+The chunker's `render()` routes each per-page iteration through
+`this.q.enqueue(() => this.renderAsync(renderer))`. The queue's
+job is to serialize tasks -- but an async generator is already
+inherently serial (you can't call `.next()` twice in parallel).
+With the rAF-tick fix above, the queue was reduced to a
+`queueMicrotask` hop plus a Promise/deferred allocation per page,
+for no purpose.
+
+Dropped the indirection: `render()` now iterates `renderer.next()`
+directly. The `Queue` class still exists in the bundle for the
+`onOverflow` re-render path (which is rare in practice), but the
+hot per-page loop bypasses it.
+
+This is a structural simplification more than a measurable speedup
+-- the queueMicrotask hop was already cheap and the deferred
+allocation amortizes. But it removes a layer that was doing
+nothing useful for our use case, which is the point of
+maintaining a fork.
+
+## Stripping headless-irrelevant async machinery
+
+paged.js was designed to be fully usable in interactive browser
+work. The async coordination patterns it carries -- always
+returning Promises from hook triggers, awaiting microtask
+boundaries between every phase, deferring tasks via animation
+frames -- pay off when the same engine is rendering inside a
+visible page that needs to stay responsive, coordinate with the
+compositor, and tolerate handlers that load external resources.
+
+In our headless puppeteer pipeline, none of that is true:
+
+- The page is offscreen; no compositor to coordinate with.
+- We don't care if any individual page-render blocks for tens of
+  milliseconds, because the browser isn't trying to repaint.
+- Every handler we register is synchronous. No hook needs to
+  await anything.
+- The book HTML is loaded before render starts (`page.goto(url,
+  { waitUntil: "load" })`), so every image's `.complete` flag is
+  already true. No image-loading awaits ever actually wait.
+
+Each remaining async wrapper is overhead we pay for a flexibility
+we never use. We're maintaining a task-specific fork; we can keep
+peeling layers as long as the simplifications don't change observed
+output.
+
+### Phase 1: hook fast-path
+
+`Hook.trigger()` upstream always wraps sync handler results in
+`new Promise(resolve => resolve(executing))` and returns
+`Promise.all(promises)`. The chunker's per-page loop awaits each
+of `beforePageLayout`, `afterPageLayout`, and `finalizePage`. With
+all six of our registered handlers running synchronously,
+`await trigger(...)` was a no-work microtask boundary per call.
+
+Patch: `Hook.trigger()` returns `undefined` when no handler
+returned a thenable. Callers in the per-page hot path become:
+
+```js
+let _p = this.hooks.X.trigger(...);
+if (_p) await _p;
+```
+
+The microtask boundary is skipped entirely on the sync fast
+path. Patched at six per-page sites (three in `chunker.layout`,
+three in `chunker.handleBreaks`).
+
+CPU profile comparison (post-queue-tick + drop-queue baseline vs
+post-Phase-1):
+
+| metric | baseline | Phase 1 | Δ |
+| ------ | -------- | ------- | --- |
+| samples | 7,353 | 6,902 | -451 |
+| profile duration | 13.07 s | 12.22 s | **-0.85 s (-6.5 %)** |
+| `getBoundingClientRect` self | 4,622 ms | 4,273 ms | -349 ms |
+| `(program)` self | 1,873 ms | 1,874 ms | flat |
+| `removeChild` self | 1,885 ms | 1,913 ms | flat |
+| `removeOverflow` self | 592 ms | 579 ms | flat |
+| `(idle)` self | n/a (< 130 ms) | n/a (< 130 ms) | flat |
+
+The 451 fewer samples account for ~800 ms of saved CPU work.
+`getBoundingClientRect`'s self-time dropped by ~350 ms; the rest
+is distributed across many small hot spots that all shrank
+slightly because they were each preceded by fewer microtask
+yields. No new hot spot appeared.
+
+> [!NOTE]
+> We compare CPU-profile sample counts and self-times here, not
+> wall-clock. Wall-clock includes I/O variance and system load on
+> the dev machine; CPU profile sample times are independent of
+> those and more reliable for "did this actually change CPU work."
+> Wall-clock numbers from these runs are noted where useful for
+> sanity-checking but aren't the primary signal.
+
+Shipped. The fix is small (one helper change + six call-site
+edits) and removes about 8k microtask boundaries from the
+per-page hot loop on a 1651-page render.
+
+### Phase 2: sync chain end-to-end through the per-page hot path
+
+With Phase 1 in place, every remaining per-page `await` in the
+chunker is an unconditional `await` on a function that returns a
+Promise even when nothing is actually awaitable. The structural
+answer is to make those functions plain sync functions.
+
+The chain, top to bottom of the per-page call tree:
+
+```
+chunker.*layout()              (async generator → sync generator)
+  chunker.handleBreaks()       (async → sync)
+  page.layout()                (async → sync)
+    Layout.renderTo()          (async → sync)
+      Layout.waitForImages()   (async → sync, throws if not preloaded)
+chunker.render() loop          (still async at the outer edge;
+                                renderer.next() now sync)
+```
+
+Phase 2 converts each step. The only function that *could* have
+been genuinely async -- `waitForImages` -- is now a synchronous
+check: it walks the supplied `<img>` nodes and throws if any
+isn't `.complete`. In our pipeline,
+`page.goto(url, { waitUntil: "load" })` settles before paged.js
+is invoked, so every image is already loaded; the throw is a
+safety net for pipeline bugs, not a runtime path we expect to
+take.
+
+The hook triggers in the per-page hot path keep the Phase 1
+fast-path semantics but switch from
+`let _p = hook.trigger(...); if (_p) await _p;` to
+`_assertSync(hook.trigger(...), "hook-name")`. The helper throws
+if a handler ever returns a thenable -- the same safety pattern
+as `waitForImages`. None of our shipping handlers do.
+
+Dead code removed in the same pass: `Chunker.renderAsync` and
+`Chunker.renderOnIdle`, both unreachable since the drop-queue
+change above stripped their only caller. Together ~30 lines of
+async machinery that existed only to wrap the (now sync)
+`renderer.next()` call.
+
+CPU profile (Phase 1 baseline vs Phase 2):
+
+| metric | Phase 1 | Phase 2 | Δ |
+| ------ | -------- | ------- | --- |
+| samples | 6,902 | 6,948 | +46 |
+| profile duration | 12.22 s | 12.35 s | +0.13 s (noise) |
+| `getBoundingClientRect` self | 4,273 ms | 4,524 ms | +251 ms (noise) |
+| `(program)` self | 1,874 ms | 1,909 ms | +35 ms |
+| `removeChild` self | 1,913 ms | 1,883 ms | -30 ms |
+| `removeOverflow` self | 579 ms | 523 ms | -56 ms |
+
+Phase 2 sits inside the run-to-run noise band on CPU time --
+the per-call CPU cost of an `await` on an already-settled Promise
+is small (a handful of microseconds), and Phase 1 already
+eliminated most of the boundary count. **What Phase 2 buys is
+not measurable CPU time -- it's structural simplicity.**
+
+Code shape, before and after:
+
+- 6 fewer `async` keywords on hot-path methods.
+- 13 `await` keywords removed from the bodies of those
+  methods (the per-page chain no longer threads `await` through
+  any of its layers).
+- One async generator (`async *layout`) → sync generator
+  (`*layout`).
+- Two dead methods removed (`renderAsync`, `renderOnIdle`).
+- Two `_assertSync` guards added at the chunker's hook call
+  sites + one at `waitForImages` -- the contract we now rely on
+  (per-page handlers all synchronous, every `<img>` preloaded)
+  is enforced at runtime with a useful error message.
+
+PDF output is **byte-identical** to the Phase 1 build on this
+content (`async-phase1/book.pdf` and `async-phase2/book.pdf`
+both 16,893,546 bytes -- a rare 0-byte timestamp drift, but
+the structural content is identical regardless).
+
+This is the kind of cleanup that's only worth doing because
+we maintain a task-specific fork of the bundle. Upstream
+paged.js has to support handlers that await fetches or image
+loads or font measurements -- our pipeline never registers one.
+Removing the async machinery in our copy shrinks the surface to
+reason about and makes the data-flow direct: a render is a
+plain function call that produces a plain return value.
+
+### What's still async, and why
+
+The async machinery that survives this audit is now at the
+once-per-render layer, where it's load-bearing:
+
+- `Chunker.flow()` is async because `loadFonts()` waits on the
+  CSS font-face descriptor's load promise, which is actually
+  async and OS-level.
+- `Chunker.render()` stays `async` as a thin wrapper so callers
+  in `flow()` can `await` it (the alternative would be to
+  remove `async` and have `flow()` not await it, but the call
+  site reads more clearly with the `await` retained).
+- `beforeParsed`, `afterParsed`, `afterRendered` hooks are still
+  awaited with the `await hook.trigger(...)` form because they
+  fire once per render and the overhead is irrelevant.
+- The `onOverflow` recovery path (`Chunker.q.enqueue(async ...)`)
+  re-renders the document if any page overflows after paint. In
+  practice this never fires for our content, but keeping the
+  recovery code intact costs nothing and preserves behaviour for
+  edge cases.
+
+The hot per-page path is now `function`, `function*`, plain
+return values, and a `while` loop. Future work that touches
+this code can reason about it as straight-line synchronous
+flow.
+
+## Doing less work in `Layout.append()`
+
+This section picks the next hotspot after the async cleanup,
+BreakToken JSON, gBCR wrapper inline, and UUID-counter changes
+had all landed. Fresh profile from a clean baseline at 100us
+sampling (V8 effectively clamped this to ~542us/sample on this
+Node/Chromium build), `--no-timing --detach-pages`, render-only:
+
+```
+   self_ms   self_%   function  @  source
+   -------   ------   --------------------------------------------------
+   4825.28   38.22%   getBoundingClientRect       (native)
+   2021.89   16.02%   (program)                   (native)
+   1954.01   15.48%   removeChild                 (native)
+    635.95    5.04%   removeOverflow              paged.browser.js
+    288.38    2.28%   wrapContent                 paged.browser.js
+    255.25    2.02%   insertBefore                (native)
+    227.01    1.80%   appendChild                 (native)
+    164.01    1.30%   findOverflow                paged.browser.js
+    140.66    1.11%   (garbage collector)         (native)
+    138.49    1.10%   afterPageLayout             paged.browser.js (Splits)
+    129.25    1.02%   cloneNode                   (native)
+    125.99    1.00%   addRefs                     paged.browser.js
+     90.15    0.71%   renderTo                    paged.browser.js
+     81.46    0.65%   filterTree                  paged.browser.js
+     80.92    0.64%   importNode                  (native)
+     80.38    0.64%   setAttribute                (native)
+     72.77    0.58%   append                      paged.browser.js
+     ...
+```
+
+The four heavy hitters are unchanged from earlier reports.
+`Layout.append` itself shows only 73 ms of self-time, but
+inclusively it owns a large fraction of the per-source-node
+work: `cloneNode`, `appendChild`/`insertBefore`, the
+`findElement` chain (`querySelector` + `getAttribute`), the
+`renderNode` hook dispatch, and `rebuildAncestors` at page
+boundaries all flow through it. With ~100k+ source-node
+clones per render, anything per-call adds up.
+
+Reading the body of `append()`, three things stood out as
+potentially-reducible:
+
+1. The `renderNode` hook dispatch fires for every cloned
+   node. Even if no handler is registered, `triggerSync`
+   still allocates a results array, runs `this.hooks.forEach`
+   over zero entries, and returns the empty array; the
+   caller then runs its own `.forEach` over that empty array.
+2. The `findElement(node.parentNode, dest)` lookup goes
+   through `getAttribute("data-ref")` on the parent. The
+   ref is also set on every source element at decoration
+   time, so the value could be stashed on a plain JS expando.
+3. `clone.dataset.ref` is read a second time at the end of
+   `append()` to register the clone in `dest.indexOfRefs`.
+   Same expando trick applies.
+
+Following the (1) thread first uncovered two separable wins:
+a bug inside the only registered `renderNode` handler, and
+the broader empty-handlers dispatch overhead.
+
+### `Footnotes.renderNode`: always-truthy NodeList condition
+
+The grep for `renderNode` method definitions in the bundle
+returns exactly one match: `Footnotes.renderNode` (in the
+package's footnotes-handling class). Every `append()` call
+goes through it. Its body:
+
+```js
+renderNode(node) {
+    if (node.nodeType == 1) {
+        let notes;
+        if (!node.dataset) return;
+
+        if (node.dataset.note === "footnote") {
+            notes = [node];
+        } else if (node.dataset.hasNotes ||
+                   node.querySelectorAll("[data-note='footnote']")) {
+            notes = node.querySelectorAll("[data-note='footnote']");
+        }
+
+        if (notes && notes.length) {
+            this.findVisibleFootnotes(notes, node);
+        }
+    }
+}
+```
+
+The `else if` condition has an upstream bug: a `NodeList` is
+always truthy (even an empty one -- it's an object), so when
+`dataset.hasNotes` is undefined the right arm of the `||`
+runs `querySelectorAll`, the condition evaluates true, and
+the next line then runs `querySelectorAll` **a second time**.
+Two subtree scans per element-node clone, for any document
+that doesn't author `data-note='footnote'` directly.
+
+`grep -c 'data-note' docs/_site-pdf/book.html` returns 0 --
+every one of those scans on every clone of every page of
+the book was dead work.
+
+The fix narrows the `else if` to the original intent:
+
+```js
+} else if (node.dataset.hasNotes) {
+    notes = node.querySelectorAll("[data-note='footnote']");
+}
+```
+
+Profile delta (post-tojson baseline vs surgical fix):
+
+| metric | baseline | post-fix | Δ |
+| ------ | -------- | -------- | --- |
+| render wall | 12.63 s | 12.63 s | flat (within noise) |
+| `querySelectorAll` self | 67.9 ms | 52.8 ms | -15 ms |
+| samples | 23,313 | 23,250 | -63 |
+
+A small saving in absolute terms: most of the eliminated
+`querySelectorAll` calls were against tiny leaf subtrees
+that terminate in microseconds when no matches are present.
+The bug fix is upstream-clean and correct; the perf-relevant
+takeaway was that *most* of the work `append()` pays for the
+`renderNode` hook is in the dispatch wrapping the handler,
+not in the handler's body. That motivated the second of the
+two wins: the empty-handlers fast-path described next.
+
+### `Hook.triggerSync` empty-handlers fast-path
+
+Mirrors the README's earlier "Phase 1: hook fast-path" for
+the async `trigger()` path. `Hook.triggerSync` previously:
+
+```js
+triggerSync() {
+    var args = arguments;
+    var context = this.context;
+    var results = [];
+    this.hooks.forEach(function (task) {
+        var executing = task.apply(context, args);
+        results.push(executing);
+    });
+    return results;
+}
+```
+
+…and the four reducer call sites in `Layout` always did:
+
+```js
+let r = this.hooks.X.triggerSync(...);
+r.forEach((newVal) => { if (newVal !== undefined) target = newVal; });
+```
+
+Walking the bundle to see which of those four hook arrays
+are actually populated in our build:
+
+| call site | hook | handlers registered |
+| --------- | ---- | ------------------- |
+| `breakAt` (line 1551) | `onBreakToken` | 0 |
+| `append` (line 1640) | `renderNode` | 1 (`Footnotes`) |
+| `findBreakToken` (line 1805) | `onOverflow` | 0 |
+| `findBreakToken` (line 1815) | `onBreakToken` | 0 |
+| `Chunker.flow` (line 2910) | `filter` | 4 |
+
+Three of the four `Layout` sites (rows 1-4 of the table; the
+`Chunker.flow` row is a fifth call site) are dispatching against
+an empty handler array every call. `onOverflow` and the two
+`onBreakToken` sites all fire from the per-page break-
+detection path, which can run more than once per page when
+overflow-and-retry happens.
+
+Patch: `triggerSync` returns `undefined` on the empty path,
+callers guard their reducer `forEach` with a truthy check.
+
+```js
+triggerSync() {
+    if (this.hooks.length === 0) return undefined;
+    // ...existing body
+}
+```
+
+```js
+let r = this.hooks.X.triggerSync(...);
+if (r) r.forEach((newVal) => { ... });
+```
+
+Profile delta (post-surgical vs post-fast-path):
+
+| metric | post-surgical | post-fast-path | Δ |
+| ------ | ------------- | -------------- | --- |
+| render wall | 12.63 s | **12.14 s** | **-0.49 s** |
+| samples | 23,250 | 22,433 | -817 |
+| `getBoundingClientRect` self | 4,819 ms | 4,714 ms | -105 ms |
+| `removeChild` self | 1,962 ms | 1,902 ms | -60 ms |
+| `removeOverflow` self | 634 ms | 552 ms | -82 ms |
+| `querySelectorAll` self | 52.8 ms | 43.4 ms | -10 ms |
+
+The wall-clock drop (~490 ms) and sample drop (817 × 542 us
+≈ 443 ms) line up cleanly, so the saving is real, not run-
+to-run noise. The reductions spread across rows because the
+per-call cost of an empty `triggerSync` -- an array alloc, a
+forEach over zero entries, a return, and the caller's own
+forEach over the returned `[]` -- creates pressure on the
+allocator and the V8 inliner that compounds on the per-page
+hot path even though no single line attributes the cost.
+
+The `renderNode` site at line 1640 does **not** hit the fast
+path in this build -- `Footnotes` still occupies it with one
+handler, so `hooks.length === 1` and the body runs as
+before. The savings come entirely from the three zero-
+handler sites.
+
+### `Footnotes` self-disables when no footnotes are in source
+
+That left the per-element `Footnotes.renderNode` dispatch
+still firing on every cloned node, plus four other hook
+methods `Footnotes` registers via the `Handler` base auto-
+wiring. Inventory of what `Footnotes` is doing on a render
+with zero footnote-marked nodes:
+
+| method | fires | what it does on a footnote-free doc |
+| ------ | ----- | ----------------------------------- |
+| `onDeclaration` | per CSS declaration | quick property-name checks. Cheap. |
+| `renderNode` | per element-node clone | short-circuits after surgical fix. |
+| `beforePageLayout` | once per page | checks `this.needsLayout.length` (always 0). Cheap. |
+| `afterPageLayout` | once per page | **3 `querySelector`s + `getBoundingClientRect` + `new Layout(...)` (which does 2 more `getBoundingClientRect`s + `getComputedStyle` in its constructor) + `findOverflow()` on the footnote-inner-content area.** Real work. |
+| `afterOverflowRemoved` | per overflow detection | `querySelectorAll` returning empty. Cheap-ish. |
+
+The big hidden cost was `afterPageLayout` -- ~1,650 calls per
+render, each measuring an empty footnote area through several
+DOM ops and constructing a transient `Layout` instance whose
+constructor itself does multiple gBCRs.
+
+The detect-and-disable plan:
+
+1. Footnotes is the *only* registrant for each of its hook
+   methods (`onDeclaration` aside -- it's a polisher-time
+   hook with other registrants, but it's also cheap).
+2. By the time `afterParsed` fires, both the CSS-driven
+   selectors (populated by `onDeclaration` calls into
+   `this.footnotes`) and any source-HTML `data-note` markers
+   are accounted for. `Footnotes.afterParsed` already runs
+   `processFootnotes(parsed, this.footnotes)` which writes
+   `data-note='footnote'` on any element matching a CSS
+   selector. So a single `parsed.querySelector(
+   "[data-note='footnote']")` at the end of that pass is
+   conclusive.
+3. If null, splice `Footnotes`'s bound functions back out
+   of each hook array. With the empty-handlers fast-path
+   from the previous section already landed, the per-page
+   and per-node dispatches then return `undefined`
+   immediately and callers skip their reducer `forEach`.
+
+To enable (3), the `Handler` base class gets a small
+addition: each `(hook, bound)` pair from auto-registration
+is stashed under its hook name on `this._registered`, and a
+new `_unregisterAll(except)` method splices each entry back
+out. The `except` argument lets the caller skip the hook
+it's currently inside (`afterParsed` in this case) --
+splicing the array we're iterating would cause the
+surrounding `trigger()` loop to skip a sibling handler.
+The skipped entry stays in `this._registered` forever, but
+it's a one-shot anyway: harmless.
+
+`Footnotes.afterParsed` then becomes:
+
+```js
+afterParsed(parsed) {
+    this.processFootnotes(parsed, this.footnotes);
+    if (!parsed.querySelector("[data-note='footnote']")) {
+        this._unregisterAll("afterParsed");
+    }
+}
+```
+
+Profile delta (post-fast-path vs post-self-disable):
+
+| metric | post-fast-path | post-self-disable | Δ |
+| ------ | -------------- | ----------------- | --- |
+| render wall | 12.14 s | **11.77 s** | **-0.37 s** |
+| samples | 22,433 | 21,809 | -624 |
+| **`getBoundingClientRect` self** | **4,714 ms** | **4,198 ms** | **-516 ms** |
+| `removeChild` self | 1,902 ms | 1,898 ms | flat |
+| `(program)` self | 2,022 ms | 2,198 ms | +176 ms |
+| `append` self | 76 ms | 69 ms | -7 ms |
+
+The 516 ms `getBoundingClientRect` drop is exactly the
+`Footnotes.afterPageLayout` cost that the inventory
+predicted -- one gBCR on `noteContent` plus two more in
+the `new Layout(noteArea, ...)` constructor plus internal
+gBCRs from `findOverflow()`, multiplied by ~1,650 pages.
+The `(program)` row growing by 176 ms is V8 reattributing
+work between native and self-time as the dispatch pattern
+changes; not new work, just a different breakdown.
+
+PDF output remained byte-identical to the previous build
+on this content (16.1 MB, same checksum on the raw
+Chromium output).
+
+### `Layout.append` parent-lookup cache
+
+When the source walker emits consecutive children of the
+same parent, `findElement(node.parentNode, dest)` in
+`append()` gets called repeatedly with the same input.
+For a parent with N children that's N - 1 redundant
+lookups -- each one cheap (`getAttribute("data-ref")` +
+`dest.indexOfRefs[ref]` is an O(1) dict hit on the fast
+path), but the call count is north of 100k per render.
+
+Patch: a three-property memo on `Layout` -- last
+`srcParent`, last `dest`, last `destParent`. Hit check at
+the top of `append`, writeback at the bottom after the
+parent is resolved (whether via direct lookup or via the
+rebuild-ancestors branch, since the rebuild attaches the
+cloned ancestor into `dest`).
+
+Invalidation: reset all three at the top of every
+`renderTo`. The cache is safe within a single `renderTo`
+loop because `append()` never detaches DOM from `dest`,
+and `removeOverflow` (the one thing that does) only fires
+at loop exit. Across `renderTo` calls on the same `Layout`
+instance the previous run's `removeOverflow` may have
+detached the cached parent, so the explicit reset is the
+correctness guard.
+
+Profile delta (post-self-disable vs post-parent-cache):
+
+| metric | post-self-disable | post-parent-cache | Δ |
+| ------ | ----------------- | ----------------- | --- |
+| render wall | 11.77 s | 11.72 s | flat (within noise) |
+| samples | 21,809 | 21,688 | -121 (~65 ms) |
+| `(program)` self | 2,198 ms | 2,169 ms | -29 ms |
+| `getAttribute` (native) | 43 ms | off-list (<40 ms) | -3 ms+ |
+| `querySelector` (native) | 63 ms | 59 ms | -4 ms |
+| `Layout.append` self | 69 ms | 70 ms | flat |
+
+On the order of 50-100 ms saved depending on the row chosen,
+fully below the run-to-run wall-clock noise band but visible in
+the cpuprofile rows. The math checks: ~100k append calls
+× ~80 % sibling-cache-hit rate × ~1 us per skipped
+findElement ≈ 80 ms.
+
+PDF output byte-identical.
+
+### What didn't land: the `_ref` expando
+
+One sibling candidate to the parent-lookup cache was
+tried and reverted. The idea: mirror `data-ref` onto a
+plain JS property `_ref` at decoration time (in
+`ContentParser.addRefs`), propagate via the `cloneNode`
+helper, and read it in `findElement` and `append`'s
+postlude instead of `getAttribute("data-ref")` /
+`clone.dataset.ref`. Both reads in the hot path become
+plain JS property loads instead of going through C++ DOM
+attribute fetches or the `DOMStringMap` proxy.
+
+Measured win on the per-row breakdown:
+
+- `Layout.append` self 69 -> 47 ms (-22 ms).
+- `getAttribute` native 43 ms -> off-list (-3+ ms).
+
+About 25 ms of real per-call work removed. Reverted: the
+saving is genuinely smaller than the diff's surface --
+`cloneNode` helper has to propagate an extra property,
+the `data-ref` attribute has to stay for CSS selectors
+and the `querySelector` fallback in `findRef`, `findElement`
+needs a `||` fallback to keep direct `.cloneNode()`
+callers in `rebuildAncestors` working unchanged, and any
+future code that wants the ref has two places it could
+read from. Not worth maintaining for a saving that
+doesn't move single-run wall-clock.
+
+Lesson worth carrying forward: at this point in the
+codebase, per-call findElement / `dataset.ref` work has
+been ground down close enough to its floor that any
+further shave produces savings in the 20-50 ms band, well
+below the run-to-run wall-clock noise on this machine.
+Reading the cpuprofile per-row deltas is the only way to
+tell whether such a change is genuine; reading wall-clock
+isn't. And the bar for landing scales with the size of
+the diff -- the parent-cache landed because it's three
+property writes and one branch; the expando didn't
+because it's a propagation pattern that ripples through
+the bundle.
+
+### Cumulative effect
+
+Across all four landings:
+
+| metric | pre-investigation | post-parent-cache | Δ |
+| ------ | ----------------- | ----------------- | --- |
+| render wall | 12.63 s | 11.72 s | **-0.91 s (-7.2 %)** |
+| samples | 23,313 | 21,688 | -1,625 |
+| `getBoundingClientRect` self | 4,825 ms | 4,194 ms | -631 ms |
+| `removeChild` self | 1,954 ms | 1,897 ms | -57 ms |
+| `removeOverflow` self | 636 ms | 583 ms | -53 ms |
+| `getAttribute` (native) | ~125 ms* | off-list (<40 ms) | -85 ms+ |
+
+\* Inferred from the post-tojson baseline rank; not
+explicitly tabulated in the top-25 cut at that time.
+
+The `Handler._registered` + `_unregisterAll(except)` plumbing
+is reusable: any future handler that determines at
+parse/decoration time that it has nothing to do for a given
+render can self-disable the same way, and the
+empty-handlers fast-path will swallow the per-call dispatch
+cost for free. That's the pattern this work leaves behind --
+combine "detect once at a known-quiet point" with "remove
+yourself from the dispatch chain" and you pay zero
+ongoing cost for inactive handlers.
+
+## Skipping the `wrapContent` innerHTML round-trip
+
+The post-append-cache profile's 5th-largest JS row was
+`wrapContent` at 260 ms. It's called once per render, right
+at the top of `Chunker.flow`, so unlike the previous fixes it
+has no per-page hot path -- the absolute size is the whole
+story.
+
+`Layout.wrapContent` lifts the entire `<body>` into a
+`<template data-ref='pagedjs-content'>` so the chunker can
+iterate the source without disturbing the live DOM. Original:
+
+```js
+template.innerHTML = body.innerHTML;
+body.innerHTML = "";
+body.appendChild(template);
+```
+
+Two heavy halves, both linear in document size:
+
+1. **`body.innerHTML` getter**: walks every node in the body
+   and serialises the entire subtree to one HTML string.
+2. **`template.innerHTML = ...` setter**: hands the string to
+   the HTML parser, which reparses it into a fresh tree
+   inside the template's contents-owner document.
+
+On our 5.5 MB book, the round-trip is exactly 260 ms.
+`find-callees.mjs` confirms 99 % of that lives in the JS frame
+itself (the C++ serialiser/parser get attributed back to the
+calling frame, same trick `removeOverflow`'s `Range`
+deletion uses):
+
+```
+wrapContent: self=259.97ms, total=262.15ms (callees=2.18ms)
+per direct callee (subtree total ms):
+      2.18 ms   querySelector  @  (native):0
+```
+
+The fix moves children directly into a plain
+`DocumentFragment`, no string round-trip:
+
+```js
+let fragment = document.createDocumentFragment();
+while (body.firstChild) fragment.appendChild(body.firstChild);
+template = document.createElement("template");
+template.dataset.ref = "pagedjs-content";
+template._pagedjsContent = fragment;  // re-entrancy stash
+body.appendChild(template);
+return fragment;
+```
+
+### Why a plain fragment, not `template.content`
+
+The first cut moved children into the template's content,
+which is the obvious shape since `wrapContent` was already
+returning `template.content`. It crashed on the first page:
+
+```
+paged.js (forked): image not loaded at render time.
+Image: file:///.../Features/Images/b0724fe2-....png
+   at Layout.waitForImages
+   at Layout.renderTo
+```
+
+The reason is in the spec. A `<template>`'s `content` fragment
+is owned by a separate "template contents owner document"
+that has no browsing context -- resources inside it never
+load. Moving a live `<img>` into `template.content` triggers
+`adoptNode` to that inert document, which then runs the
+"update the image data" algorithm, creates a fresh request
+in state "unavailable", and flips `.complete` to false. The
+source image is now stuck in that state; clones into the live
+page wrappers inherit it without the synchronous cache-hit
+path firing in time for the sync `[PATCH: assert-sync]`
+`waitForImages` check.
+
+The `innerHTML` round-trip avoids this incidentally: the
+freshly-parsed `<img>` elements in `template.content` are
+brand new (never live), they have no prior load state to
+disturb, and when their clones land in the live page wrappers
+Chromium's file:// cache lookup resolves them synchronously.
+
+A plain `DocumentFragment` is owned by the live document.
+Moving children into it is a same-document append -- no
+adoption, no "update the image data", no `.complete` reset.
+Clones from the fragment into the live page wrappers then
+take the same fast cache path the round-trip's parsed images
+did.
+
+### Re-entrancy
+
+The original returned `template.content`, so a second call
+finding the existing template just returned that same
+fragment. Under the move strategy `template.content` is
+empty (the children live in the plain fragment we returned),
+so the re-entrant branch reads the fragment back off a
+`template._pagedjsContent` expando on the marker template.
+Functionally equivalent for the one-call-per-render case
+that's actually exercised; preserves the multi-call contract
+in case anyone leans on it later.
+
+### Results
+
+Paired A/B, 2 runs each, `--detach-pages --no-timing
+--cpu-profile --cpu-sampling 100`:
+
+| run | pre | post |
+| --- | --- | --- |
+| 1 | 11.92 s | 10.72 s |
+| 2 | 11.60 s | 11.06 s |
+| **avg** | **11.76 s** | **10.89 s** |
+
+**Δ = -0.87 s render (-7.4 %).** Larger than the 260 ms the
+profile attributed to `wrapContent` itself -- the round-trip
+also allocated a transient 5.5 MB string that pushed GC and
+distributed sample noise into the surrounding rows; removing
+the allocation relieves pressure across the whole per-page
+hot path. The cpuprofile rows breakdown:
+
+| function | pre | post | Δ |
+| -------- | --- | ---- | --- |
+| `wrapContent` self | 260 ms | off-list (<25 ms) | **-260 ms+** |
+| `getBoundingClientRect` self | 4,281 ms | 4,036 ms | -245 ms |
+| `removeOverflow` self | 560 ms | 353 ms | -207 ms |
+| `removeChild` self | 1,871 ms | 1,730 ms | -141 ms |
+| `(program)` self | 2,298 ms | 2,152 ms | -146 ms |
+
+The `wrapContent` row is the only one outside the single-run
+noise band (the README's earlier methodology section pins
+that at 50-150 ms for sub-1 % rows on this machine). The
+others are plausibly real but inseparable from noise without
+more runs; the sample-count delta (-2,100 samples × 542 us
+= ~1,135 ms) matches the wall-clock delta closely enough that
+the distributed component is probably real GC-pressure
+relief, not just sampler jitter.
+
+PDF byte-equivalent to the pre-fix build (16.1 MB).
+
+### What the pattern leaves behind
+
+`removeOverflow` and `wrapContent` are both cases where V8
+rolled native DOM work (`Range.deleteContents`,
+HTML serialiser+parser) into the calling JS frame's
+self-time. The diagnostic move is the same one we used for
+gBCR attribution: `find-callees.mjs` on the suspect frame.
+If self-time is ~100 % of total, the work is happening
+inside a native callee the sampler didn't name -- read the
+JS body to find which DOM API is doing the work and whether
+it can be replaced with a cheaper equivalent.
+
+`find-callees.mjs` was added for this investigation and
+sits alongside `find-callers.mjs`; the two together cover
+both directions of the V8 attribution edge.
+
+## The per-page overflow-check rhythm: two bugs in the adaptive `maxChars`
+
+The "Attempt E: additive backoff" section above describes
+the per-page rhythm of `renderTo`'s overflow checks: append
+nodes, fire `findBreakToken` every `maxChars` chars of
+appended content, break out when it returns a non-null
+breakToken. `maxChars` defaults to 1500 and is meant to
+adapt up or down based on observed page capacity.
+
+The post-wrapContent profile showed `findOverflow` total
+2.24 s, almost all of it (1.96 s) in `hasOverflow`'s single
+gate gBCR -- one call per `findBreakToken`. Was the call
+count high because the page actually needs that many
+probes, or was the rhythm wrong?
+
+Instrumenting with `window.__breakCheckStats` and
+`window.__layoutMaxChars` answered it:
+
+```
+findBreakToken checks: 7,764  hits: 862  nulls: 6,902
+renderTo calls: 1651  checks/call avg: 4.70
+Layout.maxChars: first=1500  median=177  last=177  min=177  max=1500
+```
+
+Four findings:
+
+1. **89 % of checks (6,902 / 7,764) return null.** They're
+   "no overflow yet, keep appending" probes. Each is still
+   a full layout-flush gBCR. The actual overflow detections
+   are 862, slightly more than half of the 1651 pages
+   (the rest end naturally, or via CSS-driven breaks).
+
+2. **`Layout.maxChars` was locked at 177 for the entire
+   render** after page 1. That's an order of magnitude
+   below a typical page's capacity (which the @page CSS,
+   font size, and content density determine -- closer to
+   4000-4500 chars of body text on this book). Page 1 ran
+   with the default 1500; pages 2-1651 ran with 177.
+
+3. The reason was a propagation gate in `Page.layout`:
+   ```js
+   if (!settings.maxChars && maxChars) {
+       settings.maxChars = maxChars;
+   }
+   ```
+   `settings` is shared across all pages (one object, set
+   by reference in the Chunker constructor). The chunker
+   maintains a running estimate in `this.maxChars` via
+   `recordCharLength` and passes it into each page's
+   `layout(..., maxChars)`. But `!settings.maxChars` is
+   only truthy on the first page that gets a defined value
+   -- the rest see settings.maxChars already populated and
+   skip the update. Whatever value page 2 picked up (177,
+   from a freak short page 1 that had been recorded as
+   capacity), every subsequent page kept.
+
+4. The recording itself is biased. `recordCharLength` pushes
+   `page.wrapper.textContent.length` after every layout and
+   averages the last 4 values. Short pages -- chapter
+   endings, part dividers -- get recorded alongside full
+   pages, dragging the average well below true capacity.
+   Even with propagation fixed, the average would land
+   around 1200, not 4500.
+
+### The fix
+
+Two patches in `docs/lib/paged.browser.js`, marked
+`// [PATCH: maxChars-propagate]` and
+`// [PATCH: maxChars-running-max]`:
+
+1. **`Page.layout`'s gate drops the staleness check**:
+   `if (maxChars) settings.maxChars = maxChars;`. Each page
+   now picks up the chunker's current estimate.
+
+2. **`Chunker.recordCharLength` tracks the running max over
+   the last 16 pages** instead of the running average over
+   4. Max biases toward "the largest page recently seen,"
+   which approximates true capacity for our content. Short
+   pages still get pushed into the window but don't pull
+   the estimate down. The window of 16 is wide enough that
+   a transient stretch of short pages doesn't collapse the
+   estimate before a full page restores it.
+
+### Results
+
+Paired A/B, 2 runs each, `--detach-pages --no-timing`, no
+profiling:
+
+| run | pre | post |
+| --- | --- | --- |
+| 1 | 10.08 s | 8.15 s |
+| 2 | 11.86 s | 7.98 s |
+| **avg** | **10.97 s** | **8.07 s** |
+
+**Δ = -2.90 s render (-26 %).** CPU profile (single run,
+within noise band on the smaller rows):
+
+| metric                   | pre        | post       | Δ |
+| ------------------------ | ---------- | ---------- | --- |
+| `findOverflow` total     | 2,236 ms   | 1,690 ms   | **-546 ms** |
+| ↳ `hasOverflow` total    | 1,957 ms   | 1,597 ms   | -360 ms |
+| ↳ ↳ `gBCR` native        | 1,945 ms   | 1,587 ms   | -358 ms |
+| ↳ `findOverflow` self    | 142 ms     | 47 ms      | -95 ms |
+| ↳ walker-loop callees    | ~135 ms    | ~46 ms     | -89 ms |
+| `removeOverflow` self    | 353 ms     | 122 ms     | **-231 ms** |
+| `removeChild` self       | 1,731 ms   | 1,637 ms   | flat (noise) |
+| `(program)` self         | 2,152 ms   | 2,215 ms   | flat (noise) |
+
+The `removeOverflow` drop was the surprise. Going in, the
+concern was that bigger `maxChars` (now ~4500 instead of
+177) would mean larger overshoot when overflow fired -- so
+`extractContents` / `deleteContents` would have more nodes
+to detach. The opposite happened: `removeOverflow` self
+dropped two-thirds. The reason is the call count, not the
+per-call size. With `maxChars=177` the renderTo loop
+checked at every 177-char interval, but many of those
+checks were *near* the page boundary, where the walker in
+`findOverflow` did real work even when returning null
+(walking nodes to test text-break candidates that don't
+quite fit). With `maxChars=4500`, the very first check on
+most pages fires right at the overflow point; the walker
+runs once per page instead of several times, and the
+per-call work it does is roughly the same as before.
+
+PDF output is byte-identical to the pre-fix build
+(16.1 MB, same checksum on the raw Chromium output).
+
+### Why the average was the wrong statistic
+
+The textbook reason to track a running average is to
+estimate a stationary quantity in the presence of noise.
+The thing being estimated here -- "how many chars fit on a
+full page" -- is a tight ceiling, not a noisy reading: each
+page's textContent.length either equals page capacity
+(because the page broke for overflow) or is well below it
+(because content ran out / a CSS break fired). The
+distribution is bimodal, and the average sits between the
+modes -- exactly where it's worst as an estimator of
+either.
+
+The running max, by contrast, finds the upper mode and
+sticks to it. It only moves down if the entire window is
+sub-capacity pages, which means the document genuinely
+doesn't have full pages anymore (end of book, perhaps), at
+which point the estimate doesn't matter much.
+
+### Where this leaves the picture
+
+Render is now ~8 s on the 1651-page book, down from ~11 s
+post-wrapContent, down from ~104 s in the original
+baseline. Updated cumulative table:
+
+| fix                                 | render saved | shipped |
+| ----------------------------------- | ------------ | ------- |
+| `--detach-pages` (display:none)     |   ~55 s      | yes     |
+| aggressive detach (`removeChild`)   |   ~22 s      | yes     |
+| `renderTo` additive backoff         |   ~4.25 s    | yes     |
+| skip dead `findEndToken` path       |   ~3.5 s     | yes     |
+| `findRef` fast-path                 |   ~2.4 s     | yes     |
+| queue-tick: rAF -> queueMicrotask   |   ~2.6 s     | yes     |
+| `finalizePage` micro-optimisations  |   ~3 s       | yes     |
+| `wrapContent` move (skip innerHTML) |   ~0.9 s     | yes     |
+| **`maxChars` propagation + max**    | **~2.9 s**   | **yes** |
+| (others, smaller)                   |   ~3 s       | yes     |
+
+The strategic conclusion at the bottom of "Where this
+leaves the picture" updates accordingly: render is now
+roughly a quarter the size of generate (~8 s vs ~32 s wall on
+the production build), and `pageRanges` sharding remains
+the only knob with a profile target large enough to move
+the wall-clock total meaningfully -- and that target is
+generate, not render.
+
+## What happened when we tried move-not-clone
+
+A fresh `--detach-pages --no-timing --cpu-profile
+--cpu-sampling 100` baseline run showed `cloneNode` at
+~146 ms self-time, all of it inside `Layout.append`'s per-
+source-node clone path. `Layout.append`'s body for the
+`!shallow` (deep-cloned leaf) yields was:
+
+```js
+let clone = cloneNode(node, !shallow);  // deep clone
+// ... attach clone to dest ...
+return clone;
+```
+
+The user's question: source's read-only-template contract
+is just an artifact of paged.js's break-and-resume model.
+We're doing offline layout -- nothing reads source after
+the render finishes. Could we MOVE the source node into
+dest instead of cloning it, and avoid the allocation cost
+entirely? Best-case ceiling estimated at ~300-450 ms /
+~3-5 % of render (the cloneNode self plus distributed GC-
+pressure relief from not allocating ~250 k duplicate DOM
+nodes).
+
+### What the refactor required
+
+Three load-bearing assumptions in the chunker break the
+moment source is mutated:
+
+1. The walker traverses via live links
+   (`node.firstChild` / `nextSibling` / `parentNode`).
+   After a leaf yield, `walker = walk$2(nodeAfter(node,
+   source), source)` reads `nodeAfter` AFTER `append` has
+   moved `node` into dest -- the reads now go into dest's
+   tree, not source's. Fix: capture `nodeAfter(node,
+   source)` BEFORE the append call and pass it to the
+   walker reset.
+
+2. `BreakToken.node` stores a source-tree reference for
+   the next page's `getStart(source, breakToken)` to
+   resume from. `createBreakToken`'s four
+   `findElement(*, source)` call sites map rendered
+   (clone) nodes back to source via shared `data-ref`.
+   With moves, source has lost the leaves and findElement
+   returns the moved node now living in dest. Fix:
+   bypass `createBreakToken` entirely. Compute the
+   resume point from the extract-and-restore step
+   instead (see `restoreOverflow` below).
+
+3. `removeOverflow`'s `deleteContents` would drop the
+   moved content forever. In the clone model that was
+   fine -- source still held a pristine copy. In the
+   move model, source needs the overflow content back so
+   the next page can render it. Fix: replace with
+   `restoreOverflow` -- `extractContents` the overflow
+   range, walk the fragment depth-first collecting leaf
+   elements, and reinsert each leaf at its stashed
+   `_srcParent` / `_srcNextSibling` position. For the
+   boundary leaf that's partially overflowing,
+   `extractContents` produces a shallow clone of the
+   leaf in the fragment; we inherit its source position
+   via `source.indexOfRefs[ref]` (which still points at
+   the original-now-in-dest, which carries the stash).
+   Reverse-order iteration so each leaf's `_srcNextSibling`
+   target is back in source by the time we insert.
+
+### The bug that taught the real story
+
+First pass rendered the book to 1740 pages -- 89 more
+than the 1651-page baseline. Content was byte-identical
+modulo timestamps. Per-page char counts in the FAQ
+section showed pages 127+ with only ~50-500 chars each:
+
+```
+[BL p127] 3045 chars      [EX p127] 438 chars
+[BL p128] 3732 chars      [EX p128] 185 chars
+```
+
+Some FAQ pages had a single short paragraph. Instrumenting
+`shouldBreak` revealed it was returning true on every
+non-first yield inside the FAQ article:
+
+```
+[instrument] shouldBreak true: tag=P  ref=6bv pba=- prevNode=ARTICLE
+[instrument] shouldBreak true: tag=B  ref=6bx pba=- prevNode=ARTICLE
+[instrument] shouldBreak true: tag=P  ref=6by pba=- prevNode=ARTICLE
+... (one per FAQ paragraph)
+```
+
+The `<p>` elements have no `data-break-before` and no
+`data-previous-break-after`, so the fire is via
+`needsPageBreak(node, previousNode)` -- which checks
+whether `node`'s effective `data-page` differs from
+`previousNode`'s.
+
+`previousNode` is computed via
+`nodeBefore(node, limiter)`, which walks
+`node.previousSibling` then climbs via `parentNode` if
+no significant sibling exists. In the move model, after
+the previous yield was moved out of source, the current
+yield's `previousSibling` is `null` (the previous one no
+longer lives in source). The climb continues up:
+FAQ article (no `data-page`) -> looks at its previous
+sibling -> finds the **part-divider article** sitting
+right before the FAQ article in source, which DOES carry
+`data-page="divider"` (set by processBreaks for the CSS
+`page: divider;` rule on `article.part-divider`).
+
+So `needsPageBreak` saw a transition from
+`page="divider"` to (effectively) no page, fired true,
+and the chunker started a fresh page for every paragraph
+in the FAQ section. The chapter article's normal
+"siblings share the same effective page-name" property
+broke because the sibling-walk now escapes the chapter
+into the prior part-divider.
+
+### Fix: track previousLeaf in renderTo
+
+The chunker already knows the right answer: the last
+leaf it actually appended this page. Threaded through
+`shouldBreak` as a third argument, used by the
+`needsPageBreak` branch only (`needsBreakBefore` and the
+`parentBreakBefore` logic still use `nodeBefore`):
+
+```js
+let _moveLastLeaf = null;
+// ... in the loop ...
+if (hasRenderedContent &&
+    this.shouldBreak(node, start, _moveLastLeaf)) { ... }
+// ... after append ...
+if (!shallow) _moveLastLeaf = node;
+```
+
+In `shouldBreak`:
+
+```js
+let pageBreakRef = previousLeaf || nodeBefore(node, limiter);
+return ... || needsPageBreak(node, pageBreakRef);
+```
+
+With that, page count went 1740 -> 1653 (within 2 of
+baseline) and per-page content matched. PDF
+byte-equivalent to baseline within timestamp drift.
+
+### Profile diff
+
+Both runs `--detach-pages --cpu-profile --cpu-sampling
+100`, sample-time absolute, single run each (wall-clock
+on this machine is too noisy to be a useful signal --
+see "Methodology: compare profiles, not wall-clock"
+above):
+
+| function | baseline | move | Δ |
+| --- | --- | --- | --- |
+| `getBoundingClientRect` | 3539 ms | 4036 ms | **+497** |
+| `appendChild` | 137 ms | 390 ms | **+253** |
+| `restoreOverflow` (new) | -- | 168 ms | +168 |
+| `removeChild` | 1536 ms | 1635 ms | +99 |
+| `insertBefore` | <50 ms | 87 ms | ~+87 |
+| `getNodeWithNamedPage` | <50 ms | 108 ms | ~+85 |
+| `afterPageLayout` (AtPage) | 105 ms | 182 ms | +77 |
+| `(program)` | 2196 ms | 2266 ms | +70 |
+| `Layout` ctor | 23 ms | 31 ms | +8 |
+| `cloneNode` | 146 ms | <130 ms | **-146** |
+| `removeOverflow` | 124 ms | -- (replaced) | -124 |
+| **samples** | **17,481** | **19,590** | **+2,109** |
+| **CPU work** | **9.48 s** | **10.74 s** | **+1.26 s** |
+
+Net **+1.26 s of CPU work** -- the change is a clear
+regression, the opposite of what the prediction called for.
+
+### Why the prediction was wrong
+
+The cloneNode self-time saving (-146 ms) shows up as
+expected, but three structural costs dwarf it:
+
+1. **`appendChild` on an attached node is roughly 2x
+   the cost of `appendChild` on a fresh clone (+253 ms).**
+   A move is internally detach-from-source-parent +
+   attach-to-dest-parent; both touch Blink's child-list
+   bookkeeping. cloneNode produces an unparented node,
+   so the subsequent attach is one-sided. Intrinsic to
+   any move-based design -- no implementation choice
+   avoids it.
+
+2. **Each move dirties Blink's layout state more than
+   each clone does, distributing cost into gBCR
+   (+497 ms).** The increase is spread across every
+   gBCR call site -- `Page.create` (+225 ms),
+   `hasOverflow` (+152 ms), `Layout` ctor (+58 ms),
+   `afterPageLayout` (+31 ms), `addResizeObserver`
+   (+31 ms) -- not localized to any new code. Each
+   gBCR call flushes pending mutations; with every move
+   counting as two mutations vs one for clone+append,
+   each flush has more to do. Same migration pattern
+   the README's "Attempt B: memoize `Page.create`'s gBCR"
+   documented above -- DOM mutation cost doesn't go
+   away by elimination, it migrates to whichever frame
+   next forces a layout flush.
+
+3. **The extract-and-restore cycle adds ~340 ms of new
+   JS work.** `restoreOverflow` (168 ms) builds an
+   `extractContents` fragment + walks it for leaves +
+   inserts each back into source. `previousLeaf` makes
+   `shouldBreak` call `getNodeWithNamedPage` (108 ms)
+   on every leaf yield (it climbs parent chains looking
+   for `data-page`). `insertBefore` (87 ms) is the
+   per-restore reinsertion.
+
+The deeper structural reason: paged.js's
+break-and-resume model touches each source leaf
+O(pages-spanning-that-leaf) times in the move model -- moved into page N,
+extracted to the fragment, reinserted into source,
+moved into page N+1. Each touch is a DOM mutation. The
+clone model touches each node O(1) times -- allocated
+once, attached, thrown away with the page. Cumulative
+mutation count is structurally higher under moves.
+
+The cloneNode time the profile attributes to its native
+frame is just the *allocator* portion of cloning work --
+not the total cost of "duplicating a subtree". The rest
+hides in V8 / Blink native frames not labeled
+`cloneNode`, and that rest doesn't disappear when you
+switch to moves; it shows up as appendChild +
+invalidation cost instead.
+
+### Where this leaves the picture
+
+Reverted. The cumulative table from the previous
+section is unchanged. No row added.
+
+The pattern this attempt taught is the inverse of the
+"distributed savings often exceed direct estimates"
+heuristic the README documents elsewhere: sometimes a
+change with a direct cost saving has bigger distributed
+*regressions* that aren't visible until you measure.
+The cloneNode saving was real; the appendChild + gBCR +
+restoreOverflow overhead was bigger.
+
+The only design that would avoid all three costs is one
+that never re-moves the same node -- a single-pass
+paginator with no break-and-resume. That's not paged.js;
+it's a different algorithm. Not a small refactor.
+
+The buffer variant (pre-clone source once at startup,
+move from buffer to dest) was considered and not
+prototyped: it'd shift the cloneNode allocation cost to
+one big startup call but every per-page move would
+still hit the same appendChild + gBCR dynamic that ate
+the savings here. No structural win.
+
+This experiment also clarifies why the "Profiling
+pdf-lib's load" and "Findings: removeChild" sections
+saw allocation savings show up as wall-clock gains:
+those operations didn't have a Blink layout-tree
+mutation step downstream. Mutations are where the cost
+that *looks* like JS allocation actually lives in this
+codebase.
diff --git a/perf/analyze-heap-profile.mjs b/perf/analyze-heap-profile.mjs
new file mode 100644
index 00000000..b8d17c94
--- /dev/null
+++ b/perf/analyze-heap-profile.mjs
@@ -0,0 +1,106 @@
+// Bottom-up analyzer for V8 sampling heap profiles.
+//
+// Input is a .heapprofile (the JSON returned by CDP's
+// HeapProfiler.stopSampling). Output is a terminal table of the top
+// allocation sites ranked by self-bytes, one row per
+// (function name + source location) pair -- the same aggregation as
+// the bottom-up "Allocation sampling" view in Chrome DevTools' Memory
+// tab.
+//
+// Usage:
+//   node analyze-heap-profile.mjs <path/to/render.heapprofile> [--top N] [--min-pct P]
+//
+// Defaults: --top 30, --min-pct 0.1 (rows under 0.1% self-bytes are hidden).
+//
+// .heapprofile schema:
+//   head: { callFrame, selfSize, id, children: [...] }       (tree of nodes)
+//   samples: [{ size, nodeId, ordinal }]                     (allocation events)
+// A node's `selfSize` is the byte total of the samples whose nodeId
+// points straight at it (that node was the top of the allocation
+// stack) -- the heap analogue of cpuprofile self-time.
+
+import { readFileSync } from 'node:fs';
+import { resolve } from 'node:path';
+
+// Command line: one positional profile path plus two optional flags.
+const options = { path: null, top: 30, minPct: 0.1 };
+const argv = process.argv.slice(2);
+for (let idx = 0; idx < argv.length; idx++) {
+  const token = argv[idx];
+  if (token === '--top') {
+    options.top = parseInt(argv[++idx], 10);
+  } else if (token === '--min-pct') {
+    options.minPct = parseFloat(argv[++idx]);
+  } else if (!options.path) {
+    // Only the first bare argument is taken as the path; later ones are ignored.
+    options.path = token;
+  }
+}
+if (!options.path) {
+  console.error('usage: node analyze-heap-profile.mjs <path> [--top N] [--min-pct P]');
+  process.exit(2);
+}
+const profilePath = resolve(process.cwd(), options.path);
+
+const profile = JSON.parse(readFileSync(profilePath, 'utf8'));
+
+// Fold the node tree into one accumulator per call-frame key.
+const sites = new Map();
+let totalBytes = 0;
+function visit(node) {
+  const frame = node.callFrame || {};
+  const fn = frame.functionName || '(anonymous)';
+  const url = frame.url || '';
+  // Displayed line is the recorded lineNumber + 1; '?' when absent.
+  const line = frame.lineNumber != null ? frame.lineNumber + 1 : '?';
+  const key = `${fn}  @  ${url || '(no url)'}:${line}`;
+  const ownBytes = node.selfSize || 0;
+  let site = sites.get(key);
+  if (!site) {
+    site = { bytes: 0, fn, url, line };
+    sites.set(key, site);
+  }
+  site.bytes += ownBytes;
+  totalBytes += ownBytes;
+  for (const child of node.children || []) {
+    visit(child);
+  }
+}
+visit(profile.head);
+
+// Rank by self-bytes, then drop rows below the percentage floor and cap at --top.
+const ranked = [];
+for (const site of sites.values()) {
+  ranked.push({ ...site, pct: 100 * site.bytes / totalBytes });
+}
+ranked.sort((a, b) => b.bytes - a.bytes);
+const rows = ranked.filter((row) => row.pct >= options.minPct).slice(0, options.top);
+
+// Human-readable byte count: MB / KB with fixed decimals, otherwise raw bytes.
+function fmtBytes(n) {
+  const KB = 1024;
+  const MB = KB * KB;
+  if (n >= MB) {
+    return `${(n / 1024 / 1024).toFixed(2)} MB`;
+  }
+  if (n >= KB) {
+    return `${(n / 1024).toFixed(1)} KB`;
+  }
+  return `${n} B`;
+}
+// Two-decimal percentage, left-padded to `w` characters.
+function fmtPct(n, w) {
+  return n.toFixed(2).padStart(w);
+}
+
+console.log(`profile: ${profilePath}`);
+console.log(`samples: ${profile.samples ? profile.samples.length : '?'}  total selfSize: ${fmtBytes(totalBytes)}`);
+console.log(`top ${options.top} by self-bytes (min ${options.minPct}%):`);
+console.log('');
+console.log('   self_bytes   self_%   function  @  source');
+console.log('   ----------   ------   ----------------------------------------------');
+for (const row of rows) {
+  const shownUrl = row.url ? row.url.replace(/^file:\/\/\//, '') : '(no url)';
+  const label = row.fn || '(anonymous)';
+  console.log(`  ${fmtBytes(row.bytes).padStart(11)}   ${fmtPct(row.pct, 5)}%   ${label}  @  ${shownUrl}:${row.line}`);
+}
diff --git a/perf/diff-heap-profile.mjs b/perf/diff-heap-profile.mjs
new file mode 100644
index 00000000..0b229442
--- /dev/null
+++ b/perf/diff-heap-profile.mjs
@@ -0,0 +1,73 @@
+// Side-by-side diff of two V8 sampling heap profiles.
+//
+// Loads a "pre" and a "post" .heapprofile, sums selfSize per
+// (function name + file basename + line) in each, and prints the 20
+// keys whose self-bytes changed the most in absolute terms.
+//
+// Usage:
+//   node perf/diff-heap-profile.mjs <pre.heapprofile> <post.heapprofile>
+
+import { readFileSync } from 'node:fs';
+
+// Reads one profile and returns its per-key selfSize totals, the overall
+// selfSize total, and the number of entries in `samples` (0 if absent).
+function loadByFn(p) {
+  const profile = JSON.parse(readFileSync(p, 'utf8'));
+  const byKey = new Map();
+  let total = 0;
+  const walk = (n) => {
+    const cf = n.callFrame || {};
+    const fn = cf.functionName || '(anonymous)';
+    const line = cf.lineNumber != null ? cf.lineNumber + 1 : '?';
+    const url = (cf.url || '').replace(/^file:\/\/\//, '');
+    // Key on the file's basename only (directory prefix dropped).
+    const tail = url ? url.split(/[\\/]/).pop() : '';
+    const key = tail ? fn + ' @ ' + tail + ':' + line : fn;
+    byKey.set(key, (byKey.get(key) || 0) + (n.selfSize || 0));
+    total += n.selfSize || 0;
+    for (const c of n.children || []) walk(c);
+  };
+  walk(profile.head);
+  return { byKey, total, samples: profile.samples ? profile.samples.length : 0 };
+}
+
+const [prePath, postPath] = process.argv.slice(2);
+// Fail with a usage line (exit 2, like the sibling perf scripts) instead
+// of letting readFileSync throw on an undefined path.
+if (!prePath || !postPath) {
+  console.error('usage: node diff-heap-profile.mjs <pre.heapprofile> <post.heapprofile>');
+  process.exit(2);
+}
+const pre = loadByFn(prePath);
+const post = loadByFn(postPath);
+// Union of keys so sites present in only one profile still get a row.
+const keys = new Set([...pre.byKey.keys(), ...post.byKey.keys()]);
+const rows = [];
+for (const k of keys) {
+  const preB = pre.byKey.get(k) || 0;
+  const postB = post.byKey.get(k) || 0;
+  rows.push({ k, pre: preB, post: postB, delta: postB - preB });
+}
+
+// Byte formatter that keeps the sign: the unit is picked from |b|.
+const fmtB = b => {
+  const a = Math.abs(b);
+  if (a >= 1024 * 1024) return (b / 1024 / 1024).toFixed(2) + ' MB';
+  if (a >= 1024) return (b / 1024).toFixed(1) + ' KB';
+  return b + ' B';
+};
+const pad = (s, w) => s.padStart(w);
+
+console.log('pre  samples=' + pre.samples + ', total=' + fmtB(pre.total));
+console.log('post samples=' + post.samples + ', total=' + fmtB(post.total));
+console.log('total delta : ' + fmtB(post.total - pre.total));
+console.log();
+console.log('top 20 by |delta|:');
+console.log('       PRE           POST            Δ        function');
+console.log('   ----------    ----------    ----------    ------------------------');
+// Largest absolute change first.
+rows.sort((a, b) => Math.abs(b.delta) - Math.abs(a.delta));
+for (const r of rows.slice(0, 20)) {
+  const sign = r.delta > 0 ? '+' : '';
+  console.log('   ' + pad(fmtB(r.pre), 10) + '    ' + pad(fmtB(r.post), 10) + '    ' + pad(sign + fmtB(r.delta), 10) + '    ' + r.k);
+}
diff --git a/perf/find-callees.mjs b/perf/find-callees.mjs
new file mode 100644
index 00000000..858425ee
--- /dev/null
+++ b/perf/find-callees.mjs
@@ -0,0 +1,86 @@
+// Counterpart of find-callers.mjs: breaks down where one function's
+// time goes among its direct callees. Prints the function's self-time
+// next to each callee's subtree total, which tells you whether the cost
+// sits in the function's own body or in the calls it makes.
+//
+// Every profile node whose functionName equals the target is
+// aggregated. NOTE(review): a target nested under itself (recursion)
+// has its inner subtree added once per matching node -- confirm that
+// is acceptable for the functions this is pointed at.
+//
+// Usage:
+//   node perf/find-callees.mjs <profile> <calleeName>
+//
+// Example:
+//   node perf/find-callees.mjs results/.../render.cpuprofile removeOverflow
+
+import { readFileSync } from 'node:fs';
+
+const [profilePath, targetName] = process.argv.slice(2);
+if (!(profilePath && targetName)) {
+  console.error('usage: node find-callees.mjs <profile> <calleeName>');
+  process.exit(2);
+}
+
+const profile = JSON.parse(readFileSync(profilePath, 'utf8'));
+// Mean time per sample: profile span divided by the sample count.
+const usPerSample = (profile.endTime - profile.startTime) / profile.samples.length;
+
+const nodesById = new Map(profile.nodes.map((node) => [node.id, node]));
+
+// Sum of hitCount over `rootId` and everything reachable through
+// `children`; each node id contributes at most once.
+function subtreeHits(rootId) {
+  const visited = new Set();
+  const pending = [rootId];
+  let sum = 0;
+  // `pending` grows while we scan it, so this visits the whole subtree.
+  for (let i = 0; i < pending.length; i++) {
+    const id = pending[i];
+    if (visited.has(id)) {
+      continue;
+    }
+    visited.add(id);
+    const node = nodesById.get(id);
+    sum += node.hitCount || 0;
+    for (const childId of node.children || []) {
+      pending.push(childId);
+    }
+  }
+  return sum;
+}
+
+// Site label for a callee node: name, URL without file:/// prefix, displayed line.
+function describe(node) {
+  const name = node.callFrame?.functionName || '(anon)';
+  const url = (node.callFrame?.url || '').replace(/^file:\/\/\//, '');
+  const line = (node.callFrame?.lineNumber ?? -1) + 1;
+  return `${name}  @  ${url || '(native)'}:${line}`;
+}
+
+let selfHits = 0;
+let totalHits = 0;
+const calleeHits = new Map();
+
+const matches = profile.nodes.filter((node) => (node.callFrame?.functionName || '') === targetName);
+for (const match of matches) {
+  selfHits += match.hitCount || 0;
+  totalHits += subtreeHits(match.id);
+  for (const childId of match.children || []) {
+    const key = describe(nodesById.get(childId));
+    calleeHits.set(key, (calleeHits.get(key) || 0) + subtreeHits(childId));
+  }
+}
+
+const toMs = (hits) => hits * usPerSample / 1000;
+const ms = (hits) => toMs(hits).toFixed(2);
+
+console.log(`${targetName}: self=${ms(selfHits)}ms, total=${ms(totalHits)}ms (callees combined=${ms(totalHits - selfHits)}ms)`);
+console.log('per direct callee (subtree total ms):');
+const sorted = [...calleeHits.entries()].sort((a, b) => b[1] - a[1]);
+for (const [label, hits] of sorted) {
+  // Callees under half a millisecond are left out of the listing.
+  if (toMs(hits) >= 0.5) {
+    console.log(`  ${ms(hits).padStart(8)} ms   ${label}`);
+  }
+}
diff --git a/perf/grep-profile.mjs b/perf/grep-profile.mjs
new file mode 100644
index 00000000..0b864e43
--- /dev/null
+++ b/perf/grep-profile.mjs
@@ -0,0 +1,39 @@
+// One-off helper: prints every .cpuprofile node whose functionName
+// matches a regex, along with its self-time and source location. Handy
+// for answering "does this frame show up in the profile, and under
+// what name?"
+
+import { readFileSync } from 'node:fs';
+
+const [profilePath, pattern] = process.argv.slice(2);
+if (!(profilePath && pattern)) {
+  console.error('usage: node grep-profile.mjs <profile> <regex>');
+  process.exit(2);
+}
+
+const profile = JSON.parse(readFileSync(profilePath, 'utf8'));
+const usPerSample = (profile.endTime - profile.startTime) / profile.samples.length;
+const re = new RegExp(pattern);
+
+// One row per matching node (not merged by name), biggest self-time first.
+const rows = profile.nodes
+  .filter((node) => re.test(node.callFrame?.functionName || ''))
+  .map((node) => {
+    const frame = node.callFrame;
+    const hits = node.hitCount || 0;
+    return {
+      ms: hits * usPerSample / 1000,
+      fn: frame?.functionName || '',
+      url: (frame?.url || '').replace(/^file:\/\/\//, '') || '(native)',
+      line: (frame?.lineNumber ?? -1) + 1,
+      hits,
+    };
+  })
+  .sort((a, b) => b.ms - a.ms);
+
+let total = 0;
+rows.forEach((row) => {
+  total += row.ms;
+  console.log(`  ${row.ms.toFixed(2).padStart(8)} ms   ${row.fn}  @  ${row.url}:${row.line}  hits=${row.hits}`);
+});
+console.log(`  -------- ${total.toFixed(2)} ms total across ${rows.length} matching nodes`);
diff --git a/perf/instrument-clones.js b/perf/instrument-clones.js
new file mode 100644
index 00000000..7358d47c
--- /dev/null
+++ b/perf/instrument-clones.js
@@ -0,0 +1,139 @@
+// One-off probe: count how many Layout.append clones survive into the
+// finalized page wrapper vs. how many get rolled back by removeOverflow.
+//
+// Mechanism:
+//   - Wrap Layout.prototype.append to (a) count calls and (b) tag every
+//     returned clone with an expando __pagedjs_clone_tag = true.
+//   - Wrap Node.prototype.cloneNode globally so we can also report the
+//     gross cloneNode call count (which includes rebuildAncestors and
+//     anything else outside Layout.append).
+//   - At finalizePage, walk the just-finalized page wrapper counting
+//     tagged survivors. (removeOverflow has already fired by this point.)
+//   - At afterRendered, summarise totals + per-page distribution.
+//
+// Cost: O(1) per append + one tree walk per finalized page. Run with
+//   --detach-pages --no-timing --additional-script ..\perf\instrument-clones.js
+// from a measure.mjs invocation. Numbers are reported via console.log
+// which measure.mjs forwards to stdout.
+
+(() => {
+    const Layout = window.PagedLayout;
+    if (!Layout) {
+        console.log('[clone-count] ERROR: window.PagedLayout not exposed; bundle patch missing.');
+        return;
+    }
+    // Count every Layout.append call and tag whatever node it returns.
+    const origAppend = Layout.prototype.append;
+    let appendCalls = 0;
+    Layout.prototype.append = function (...args) {
+        const clone = origAppend.apply(this, args);
+        appendCalls++;
+        if (clone) clone.__pagedjs_clone_tag = true;
+        return clone;
+    };
+
+    // Count every cloneNode call made on any Node, whoever the caller is.
+    const origCloneNode = Node.prototype.cloneNode;
+    let cloneNodeCalls = 0;
+    Node.prototype.cloneNode = function (deep) {
+        cloneNodeCalls++;
+        return origCloneNode.call(this, deep);
+    };
+
+    const perPage = []; // { appended, kept }
+    let appendAtPageStart = 0;
+
+    class CloneCountHandler extends Paged.Handler {
+        // Snapshot the append counter so finalizePage can diff against it.
+        beforePageLayout() {
+            appendAtPageStart = appendCalls;
+        }
+        // Record how many appends this page made and how many tagged
+        // nodes are still under the finalized page element.
+        finalizePage(pageElement) {
+            const appendedThisPage = appendCalls - appendAtPageStart;
+            let kept = 0;
+            const walker = document.createTreeWalker(
+                pageElement,
+                NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT
+            );
+            let n;
+            while ((n = walker.nextNode())) {
+                if (n.__pagedjs_clone_tag) kept++;
+            }
+            perPage.push({ appended: appendedThisPage, kept });
+        }
+        // Print totals, the worst page, a bucketed histogram, and
+        // percentile cutpoints of per-page overshoot.
+        afterRendered(pages) {
+            let totalAppended = 0;
+            let totalKept = 0;
+            let pagesWithOvershoot = 0;
+            let maxOvershoot = 0;
+            let maxOvershootPage = -1;
+            const pcts = [];
+            perPage.forEach((entry, idx) => {
+                totalAppended += entry.appended;
+                totalKept += entry.kept;
+                const over = entry.appended - entry.kept;
+                if (over > 0) pagesWithOvershoot++;
+                if (over > maxOvershoot) {
+                    maxOvershoot = over;
+                    maxOvershootPage = idx;
+                }
+                pcts.push(entry.appended > 0 ? (over / entry.appended) * 100 : 0);
+            });
+            const totalOvershoot = totalAppended - totalKept;
+            const pct = totalAppended > 0
+                ? (totalOvershoot / totalAppended) * 100
+                : 0;
+
+            console.log(`[clone-count] pages=${pages.length}`);
+            console.log(`[clone-count] Layout.append calls (source-walker leaf clones): ${totalAppended}`);
+            console.log(`[clone-count] survivors in finalized pages: ${totalKept}`);
+            console.log(`[clone-count] overshoot (appended-then-removed): ${totalOvershoot} (${pct.toFixed(1)}%)`);
+            console.log(`[clone-count] pages with any overshoot: ${pagesWithOvershoot}/${pages.length}`);
+            console.log(`[clone-count] max overshoot on one page: ${maxOvershoot} (page index ${maxOvershootPage}, appended=${perPage[maxOvershootPage]?.appended ?? 0})`);
+            console.log(`[clone-count] gross Node.cloneNode calls (incl. rebuildAncestors, handlers, etc.): ${cloneNodeCalls}`);
+            console.log(`[clone-count] non-Layout.append clones: ${cloneNodeCalls - totalAppended}`);
+
+            // Per-page overshoot % buckets.
+            const buckets = [
+                { lo: 0,   hi: 1   },
+                { lo: 1,   hi: 5   },
+                { lo: 5,   hi: 10  },
+                { lo: 10,  hi: 20  },
+                { lo: 20,  hi: 30  },
+                { lo: 30,  hi: 50  },
+                { lo: 50,  hi: 101 },
+            ];
+            const counts = buckets.map(() => 0);
+            for (const p of pcts) {
+                for (let i = 0; i < buckets.length; i++) {
+                    if (p >= buckets[i].lo && p < buckets[i].hi) {
+                        counts[i]++;
+                        break;
+                    }
+                }
+            }
+            console.log(`[clone-count] per-page overshoot % distribution:`);
+            for (let i = 0; i < buckets.length; i++) {
+                const b = buckets[i];
+                const hi = b.hi === 101 ? '100' : String(b.hi);
+                console.log(`[clone-count]   ${String(b.lo).padStart(3)} - ${hi.padStart(3)}%: ${counts[i]} pages`);
+            }
+
+            // Cumulative percentile cutpoints. With no recorded pages every
+            // cutpoint reports 0 rather than throwing on an undefined entry.
+            const sortedPcts = pcts.slice().sort((a, b) => a - b);
+            const pickPct = (q) => sortedPcts[Math.min(sortedPcts.length - 1, Math.floor(q * sortedPcts.length))] ?? 0;
+            // `max` is the last sorted entry itself: a 0.999 quantile index
+            // stops short of it on runs with more than 1000 pages.
+            const maxPct = sortedPcts[sortedPcts.length - 1] ?? 0;
+            console.log(`[clone-count] per-page overshoot %: p50=${pickPct(0.5).toFixed(1)}% p90=${pickPct(0.9).toFixed(1)}% p99=${pickPct(0.99).toFixed(1)}% max=${maxPct.toFixed(1)}%`);
+        }
+    }
+
+    Paged.registerHandlers(CloneCountHandler);
+    console.log('[clone-count] handler registered');
+})();
diff --git a/perf/instrument-detach.js b/perf/instrument-detach.js
new file mode 100644
index 00000000..73afa9d6
--- /dev/null
+++ b/perf/instrument-detach.js
@@ -0,0 +1,97 @@
+// Per-call timing for the detach-pages.js removeChild path.
+//
+// Wraps detach-pages.js's removeChild call so we can see whether the
+// cost is roughly flat per call (some Blink-internal fixed overhead) or
+// scales with the page's descendant count (LayoutObject teardown).
+//
+// Loaded as an --additional-script AFTER detach-pages.js so the
+// instrumentation can monkey-patch the prototype that detach-pages.js
+// uses. Records per-call ns + descendant count for each page detach and
+// buckets the cost by descendant count. Dumped at afterRendered through
+// the [instrument] prefix so the harness pipes it to stdout.
+
+(() => {
+  const origRemoveChild = Node.prototype.removeChild;   // original method; reinstated at afterRendered
+  const samples = [];   // one { ns, descendants } entry per detached page element
+  let totalNs = 0;           // wrapper-measured time summed over every removeChild call, in ns
+  let pageDetachCount = 0;   // removals whose child carries the pagedjs_page class
+  let otherCount = 0;        // every other removeChild call
+
+  Node.prototype.removeChild = function (child) {
+    // Only page wrappers (Elements carrying the `pagedjs_page` class) are
+    // sampled, so the subtree is sized for those alone; every other
+    // removal is merely counted and pays no extra DOM walk.
+    const isPage = Boolean(
+      child && child.nodeType === 1 &&
+      child.classList && child.classList.contains('pagedjs_page'));
+    // getElementsByTagName('*') counts every descendant element. It runs
+    // before t0, so its cost stays out of the recorded duration.
+    const descendants = isPage && child.getElementsByTagName
+      ? child.getElementsByTagName('*').length
+      : 0;
+    const t0 = performance.now();
+    const r = origRemoveChild.call(this, child);
+    const ns = (performance.now() - t0) * 1e6;   // performance.now() is in ms; scale to ns
+    totalNs += ns;
+    if (isPage) {
+      pageDetachCount++;
+      samples.push({ ns, descendants });
+    } else {
+      otherCount++;
+    }
+    return r;
+  };
+
+  class DetachInstrument extends Paged.Handler {
+    // Paged.js afterRendered hook (`pages` is unused): prints latency stats and a
+    // cost-by-descendant-count table for the sampled page detaches, then unpatches.
+    afterRendered(pages) {
+      const pageSamples = samples.slice().sort((a, b) => a.ns - b.ns);
+      // A run that detached no pages must report zeros rather than NaN averages.
+      const count = pageSamples.length;
+      const pick = (q) => (count ? pageSamples[Math.floor(count * q)].ns : 0);
+      const sumDesc = pageSamples.reduce((s, x) => s + x.descendants, 0);
+      const sumNs = pageSamples.reduce((s, x) => s + x.ns, 0);
+      const avgNs = count ? sumNs / count : 0;
+      const avgDescPerPage = count ? sumDesc / count : 0;
+
+      console.log(`[instrument] removeChild wrapper: ${pageDetachCount} page detaches, ${otherCount} other`);
+      console.log(`[instrument] total removeChild wall: ${(totalNs / 1e6).toFixed(1)} ms`);
+      console.log(`[instrument] page-detach total:     ${(sumNs / 1e6).toFixed(1)} ms`);
+      console.log(`[instrument] page-detach avg:       ${(avgNs / 1e6).toFixed(3)} ms/call`);
+      console.log(`[instrument] page-detach median:    ${(pick(0.5) / 1e6).toFixed(3)} ms/call`);
+      console.log(`[instrument] page-detach p90:       ${(pick(0.9) / 1e6).toFixed(3)} ms/call`);
+      console.log(`[instrument] page-detach p99:       ${(pick(0.99) / 1e6).toFixed(3)} ms/call`);
+      console.log(`[instrument] avg descendants/page:  ${avgDescPerPage.toFixed(1)}`);
+
+      // Bucket by descendant count to see proportionality.
+      const buckets = [
+        { lo: 0,   hi: 100,   n: 0, ns: 0, desc: 0 },
+        { lo: 100, hi: 200,   n: 0, ns: 0, desc: 0 },
+        { lo: 200, hi: 400,   n: 0, ns: 0, desc: 0 },
+        { lo: 400, hi: 800,   n: 0, ns: 0, desc: 0 },
+        { lo: 800, hi: 1600,  n: 0, ns: 0, desc: 0 },
+        { lo: 1600,hi: Infinity, n: 0, ns: 0, desc: 0 },
+      ];
+      for (const s of pageSamples) {
+        const b = buckets.find(bk => s.descendants >= bk.lo && s.descendants < bk.hi);
+        if (b) { b.n++; b.ns += s.ns; b.desc += s.descendants; }
+      }
+      console.log(`[instrument] removeChild cost by descendant-count bucket:`);
+      console.log(`[instrument]   desc-range  count  total_ms  avg_ms  avg_desc  us_per_desc`);
+      for (const b of buckets) {
+        if (!b.n) continue;
+        const avgMs = b.ns / b.n / 1e6;
+        const avgDesc = b.desc / b.n;
+        const usPerDesc = avgDesc > 0 ? (avgMs / avgDesc) * 1000 : 0;   // x1000: ms -> microseconds
+        const range = b.hi === Infinity ? `${b.lo}+` : `${b.lo}-${b.hi}`;
+        console.log(`[instrument]   ${range.padEnd(10)} ${String(b.n).padStart(6)} ${(b.ns / 1e6).toFixed(1).padStart(9)} ${avgMs.toFixed(3).padStart(7)} ${avgDesc.toFixed(0).padStart(9)} ${usPerDesc.toFixed(2).padStart(12)}`);
+      }
+
+      // Restore so afterRendered's own removeChild (when re-appending) isn't double-charged.
+      Node.prototype.removeChild = origRemoveChild;
+    }
+  }
+  Paged.registerHandlers(DetachInstrument);   // registers the class so its afterRendered report runs
+  console.log('[instrument] removeChild wrapper installed');   // '[instrument]' is the prefix the harness relays
+})();
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 94eedcef..c9df9cf6 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -22,8 +22,22 @@
 // Usage:
 //   node measure.mjs [path/to/book.html] [--out <dir>] [--keep-open]
 //                    [--cpu-profile] [--cpu-sampling <microseconds>]
+//                    [--heap-profile] [--heap-sampling <bytes>]
 //                    [--detach-pages] [--instrument] [--time-hooks]
-//                    [--incremental] [--chrome-outline]
+//                    [--incremental] [--chrome-outline] [--no-timing]
+//                    [--clone-count] [--render-only]
+//
+// --render-only bails out after the render phase. Skips meta extraction,
+// parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
+// Useful for cpu-profile / instrumentation runs where only the render
+// phase matters; trims ~45s off the full ~55s book run. No book.pdf is
+// written, and the timing.json / summary.txt omit generate/process.
+//
+// --no-timing skips the per-page timing-handler.js injection. The handler
+// adds a per-page console.log relayed via CDP that costs ~2% of render
+// self-time on the 1638-page book. Use when profiling for the cleanest
+// possible bottom-up table; loses the per-page CSV and the first/last
+// quartile summary in return.
 //
 // --detach-pages also injects detach-pages.js -- a Paged.Handler that
 // hides each completed page from the layout tree -- to test whether
@@ -75,22 +89,32 @@ let outArg = null;
 let keepOpen = false;
 let cpuProfile = false;
 let cpuSampling = 1000; // microseconds
+let heapProfile = false;
+let heapSampling = 32768; // bytes between samples (CDP default)
 let detachPages = false;
 let instrument = false;
 let timeHooks = false;
 let incremental = false;
 let chromeOutline = false;
+let noTiming = false;
+let cloneCount = false;
+let renderOnly = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
   else if (a === '--keep-open') keepOpen = true;
   else if (a === '--cpu-profile') cpuProfile = true;
   else if (a === '--cpu-sampling') cpuSampling = parseInt(args[++i], 10);
+  else if (a === '--heap-profile') heapProfile = true;
+  else if (a === '--heap-sampling') heapSampling = parseInt(args[++i], 10);
   else if (a === '--detach-pages') detachPages = true;
   else if (a === '--instrument') instrument = true;
   else if (a === '--time-hooks') timeHooks = true;
   else if (a === '--incremental') incremental = true;
   else if (a === '--chrome-outline') chromeOutline = true;
+  else if (a === '--no-timing') noTiming = true;
+  else if (a === '--clone-count') cloneCount = true;
+  else if (a === '--render-only') renderOnly = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -110,10 +134,13 @@ const handlerPath      = resolve(__dirname, 'timing-handler.js');
 const detachPagesPath  = resolve(__dirname, 'detach-pages.js');
 const instrumentPath   = resolve(__dirname, 'instrument-flush-ops.js');
 const timeHooksPath    = resolve(__dirname, 'time-hooks.js');
-const required = [pagedScriptPath, handlerPath];
+const cloneCountPath   = resolve(__dirname, 'instrument-clones.js');
+const required = [pagedScriptPath];
+if (!noTiming)  required.push(handlerPath);
 if (detachPages) required.push(detachPagesPath);
 if (instrument)  required.push(instrumentPath);
 if (timeHooks)   required.push(timeHooksPath);
+if (cloneCount)  required.push(cloneCountPath);
 for (const p of required) {
   if (!existsSync(p)) {
     console.error(`missing required file: ${p}`);
@@ -160,7 +187,8 @@ try {
   page.on('console', (msg) => {
     const t = msg.text();
     if (t.startsWith('[paged-timing]') || t.startsWith('[detach-pages]') ||
-        t.startsWith('[instrument]') || t.startsWith('  ')) {
+        t.startsWith('[instrument]') || t.startsWith('[clone-count]') ||
+        t.startsWith('  ')) {
       console.log(t);
     }
   });
@@ -183,7 +211,9 @@ try {
   });
 
   await page.addScriptTag({ path: pagedScriptPath });
-  await page.addScriptTag({ path: handlerPath });
+  if (!noTiming) {
+    await page.addScriptTag({ path: handlerPath });
+  }
   if (detachPages) {
     await page.addScriptTag({ path: detachPagesPath });
   }
@@ -193,19 +223,30 @@ try {
   if (timeHooks) {
     await page.addScriptTag({ path: timeHooksPath });
   }
+  if (cloneCount) {
+    await page.addScriptTag({ path: cloneCountPath });
+  }
 
   // RENDER ----------------------------------------------------------
-  // Optionally wrap just this phase in a V8 CPU profile. CDP Profiler
-  // attaches to the renderer for this page; we stop before the generate
-  // phase so the trace stays focused on paged.js layout work.
+  // Optionally wrap just this phase in a V8 CPU and/or heap sampling
+  // profile. CDP attaches to the renderer for this page; we stop
+  // before the generate phase so the traces stay focused on paged.js
+  // layout work.
   let cdp = null;
-  if (cpuProfile) {
+  if (cpuProfile || heapProfile) {
     cdp = await page.createCDPSession();
+  }
+  if (cpuProfile) {
     await cdp.send('Profiler.enable');
     await cdp.send('Profiler.setSamplingInterval', { interval: cpuSampling });
     await cdp.send('Profiler.start');
     console.log(`[harness] cpu profile: sampling every ${cpuSampling}us`);
   }
+  if (heapProfile) {
+    await cdp.send('HeapProfiler.enable');
+    await cdp.send('HeapProfiler.startSampling', { samplingInterval: heapSampling });
+    console.log(`[harness] heap profile: sampling every ${heapSampling} bytes`);
+  }
 
   const tRenderStart = Date.now();
   await page.evaluate(async () => {
@@ -226,17 +267,39 @@ try {
   const renderMs = tRenderEnd - tRenderStart;
 
   let profilePath = null;
+  let heapProfilePath = null;
   if (cdp) {
-    const { profile } = await cdp.send('Profiler.stop');
+    if (cpuProfile) {
+      const { profile } = await cdp.send('Profiler.stop');
+      profilePath = join(outDir, 'render.cpuprofile');
+      const profileJson = JSON.stringify(profile);
+      writeFileSync(profilePath, profileJson);
+      console.log(`[harness] cpu profile: ${profilePath} (${(profileJson.length / 1024 / 1024).toFixed(1)} MB)`);
+    }
+    if (heapProfile) {
+      const { profile } = await cdp.send('HeapProfiler.stopSampling');
+      heapProfilePath = join(outDir, 'render.heapprofile');
+      const profileJson = JSON.stringify(profile);
+      writeFileSync(heapProfilePath, profileJson);
+      const totalBytes = profile.samples.reduce((s, x) => s + x.size, 0);
+      console.log(`[harness] heap profile: ${heapProfilePath} (${(profileJson.length / 1024 / 1024).toFixed(1)} MB, ${profile.samples.length} samples, ${(totalBytes / 1024 / 1024).toFixed(1)} MB allocated)`);
+    }
     await cdp.detach();
-    profilePath = join(outDir, 'render.cpuprofile');
-    const profileJson = JSON.stringify(profile);
-    writeFileSync(profilePath, profileJson);
-    console.log(`[harness] cpu profile: ${profilePath} (${(profileJson.length / 1024 / 1024).toFixed(1)} MB)`);
   }
 
   console.log(`[harness] render   ${fmtMs(renderMs)}`);
 
+  // Declared outside the generate/process blocks so the persistence /
+  // summary code can read them either way. --render-only leaves them null.
+  let generateMs = null;
+  let parseOutlineMs = null;
+  let pdfMs = null;
+  let rawPdfBytes = null;
+  let processMs = null;
+  let processBreakdown = null;
+  let finalPdf = null;
+
+  if (!renderOnly) {
   // GENERATE --------------------------------------------------------
   // meta extraction + outline DOM walk + Chromium DOM->PDF.
   const tGenStart = Date.now();
@@ -258,7 +321,7 @@ try {
   // would get overwritten by Chrome's /Outlines anyway.
   const tParseOutlineStart = Date.now();
   const outline = chromeOutline ? [] : await parseOutline(page, outlineTags);
-  const parseOutlineMs = Date.now() - tParseOutlineStart;
+  parseOutlineMs = Date.now() - tParseOutlineStart;
 
   const tPdfStart = Date.now();
   const rawPdf = await page.pdf({
@@ -274,10 +337,11 @@ try {
     // both on would have our setOutline overwrite Chrome's /Outlines.
     ...(chromeOutline ? { outline: true, tagged: true } : {}),
   });
-  const pdfMs = Date.now() - tPdfStart;
+  pdfMs = Date.now() - tPdfStart;
+  rawPdfBytes = rawPdf.length;
 
   const tGenEnd = Date.now();
-  const generateMs = tGenEnd - tGenStart;
+  generateMs = tGenEnd - tGenStart;
   console.log(`[harness] generate ${fmtMs(generateMs)}  (parseOutline=${fmtMs(parseOutlineMs)}, page.pdf=${fmtMs(pdfMs)}, ${(rawPdf.length / 1024 / 1024).toFixed(1)}MB)`);
 
   // PROCESS ---------------------------------------------------------
@@ -294,8 +358,6 @@ try {
   // Either way we time the full phase plus the meaningful sub-steps so the
   // breakdown matches across runs.
   const tProcStart = Date.now();
-  let finalPdf;
-  let processBreakdown;
   if (incremental) {
     const tIncStart = Date.now();
     const { bytes, stats } = await applyOutlineAndMetadataIncremental(rawPdf, outline, meta);
@@ -327,25 +389,30 @@ try {
     processBreakdown = { loadMs, setOutlineMs, saveMs };
   }
   const tProcEnd  = Date.now();
-  const processMs = tProcEnd - tProcStart;
+  processMs = tProcEnd - tProcStart;
   if (incremental) {
     console.log(`[harness] process  ${fmtMs(processMs)}  (incremental=${fmtMs(processBreakdown.incrementalMs)}, +${processBreakdown.appendedBytes}B, ${processBreakdown.newObjectCount} new objs)`);
   } else {
     console.log(`[harness] process  ${fmtMs(processMs)}  (load=${fmtMs(processBreakdown.loadMs)}, setOutline=${fmtMs(processBreakdown.setOutlineMs)}, save=${fmtMs(processBreakdown.saveMs)})`);
   }
+  }  // end if (!renderOnly)
 
-  const totalMs = tProcEnd - tRenderStart;
+  const totalMs = Date.now() - tRenderStart;
   console.log(`[harness] total    ${fmtMs(totalMs)}`);
 
   // Persist results -------------------------------------------------
-  const timing = await page.evaluate(() => window.__pagedTiming);
-  const pdfPath = join(outDir, 'book.pdf');
-  writeFileSync(pdfPath, Buffer.from(finalPdf));
+  const timing = noTiming
+    ? { pages: [], phases: {}, pageCount: null }
+    : await page.evaluate(() => window.__pagedTiming);
+  if (finalPdf) {
+    const pdfPath = join(outDir, 'book.pdf');
+    writeFileSync(pdfPath, Buffer.from(finalPdf));
+  }
 
   const record = {
     input: inputPath,
     pageCount: timing.pageCount,
-    pdfBytes: finalPdf.length,
+    pdfBytes: finalPdf ? finalPdf.length : null,
     cpuProfile: profilePath,
     phases: {
       render: {
@@ -353,20 +420,22 @@ try {
         perPage: timing.pages,
         phaseMarks: timing.phases,
       },
-      generate: {
-        ms: generateMs,
-        parseOutlineMs,
-        pagePdfMs: pdfMs,
-        rawPdfBytes: rawPdf.length,
-      },
-      process: {
-        ms: processMs,
-        mode: incremental ? 'incremental' : 'pdf-lib-roundtrip',
-        ...processBreakdown,
-      },
     },
     totalMs,
   };
+  if (!renderOnly) {
+    record.phases.generate = {
+      ms: generateMs,
+      parseOutlineMs,
+      pagePdfMs: pdfMs,
+      rawPdfBytes,
+    };
+    record.phases.process = {
+      ms: processMs,
+      mode: incremental ? 'incremental' : 'pdf-lib-roundtrip',
+      ...processBreakdown,
+    };
+  }
   writeFileSync(join(outDir, 'timing.json'), JSON.stringify(record, null, 2));
 
   const csv = ['page,dur_ms,heap_start_mb,heap_end_mb,elapsed_s'];
@@ -385,11 +454,17 @@ try {
   const summary = [];
   summary.push(`input        : ${inputPath}`);
   summary.push(`pages        : ${pages.length}`);
-  summary.push(`pdf size     : ${(finalPdf.length / 1024 / 1024).toFixed(1)} MB`);
+  if (finalPdf) {
+    summary.push(`pdf size     : ${(finalPdf.length / 1024 / 1024).toFixed(1)} MB`);
+  }
   summary.push('');
   summary.push(`render       : ${fmtMs(renderMs)}    (per-page layout via paged.js)`);
-  summary.push(`generate     : ${fmtMs(generateMs)}    (parseOutline + page.pdf)`);
-  summary.push(`process      : ${fmtMs(processMs)}    (${incremental ? 'incremental update (append outline + updated catalog/info)' : 'pdf-lib load + setOutline + save'})`);
+  if (!renderOnly) {
+    summary.push(`generate     : ${fmtMs(generateMs)}    (parseOutline + page.pdf)`);
+    summary.push(`process      : ${fmtMs(processMs)}    (${incremental ? 'incremental update (append outline + updated catalog/info)' : 'pdf-lib load + setOutline + save'})`);
+  } else {
+    summary.push(`(generate + process skipped: --render-only)`);
+  }
   summary.push(`total        : ${fmtMs(totalMs)}`);
   summary.push('');
   if (pages.length >= 4) {