Skip to content

Commit efca24f

Browse files
authored
Merge pull request #148 from KubaO/staging
Make the PDF build time bearable.
2 parents 0402d9f + 5956723 commit efca24f

29 files changed

Lines changed: 39812 additions & 2022 deletions

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,8 @@
22
CLAUDE.md
33
/.htmltest.yml
44
node_modules/
5+
6+
# perf/measure.mjs --out targets at repo root (book.pdf + render.cpuprofile + timing.*)
7+
/before/
8+
/after-*/
9+
/findoverflow-baseline/

docs/assets/css/print.css

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,14 @@
1313
margin: 22mm 20mm 22mm 20mm;
1414

1515
@bottom-right {
16-
content: counter(page);
16+
/* Reads a JS-tracked page number set on each .pagedjs_page wrapper
17+
by the Counters handler in docs/lib/paged.browser.js. Switched off
18+
`counter(page)` because the aggressive-detach render optimization
19+
(perf/detach-pages.js) physically removes finalized pages from the
20+
DOM, which breaks CSS counter accumulation. The Counters handler
21+
honours the same part-divider counter-reset rules as the original
22+
counter(page) did, so part-restarts continue to work. */
23+
content: var(--page-num);
1724
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif;
1825
font-size: 9pt;
1926
color: #555;

docs/book.bat

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,27 @@
22
rem PDF render only. Run build.bat (or `bundle exec jekyll build`) first
33
rem so _site-pdf\book.html and its dependencies exist; this script
44
rem assumes the Pdfify plugin has already populated _site-pdf\.
5+
rem
6+
rem render-book.mjs drives puppeteer + paged.js + pdf-lib directly so
7+
rem we control pdf-lib's parseSpeed (the default yields the event loop
8+
rem between every 100 objects on load, adding ~32 s to a 100 s build
9+
rem for no reason in Node -- see perf\README.md "Profiling pdf-lib's
10+
rem load" for the full diagnosis). pagedjs-cli passed no options to
11+
rem load/save and inherited that cost; we don't.
12+
rem
13+
rem --additional-script ..\perf\detach-pages.js injects a Paged.Handler
14+
rem that hides each finalised page from Chromium's layout tree and
15+
rem restores them all before page.pdf() runs. Drops total render from
16+
rem ~104s to ~51s on the 1638-page book by eliminating the O(n^2)
17+
rem getBoundingClientRect cost in paged.js's overflow walker.
518
if not exist _site-pdf\book.html (
619
echo _site-pdf\book.html not found. Run build.bat first.
720
exit /b 1
821
)
22+
if not exist node_modules\puppeteer\package.json (
23+
echo Installing docs\ dependencies...
24+
call npm install
25+
if errorlevel 1 exit /b 1
26+
)
927
if not exist _pdf mkdir _pdf
10-
npx pagedjs-cli _site-pdf\book.html -o _pdf\book.pdf --outline-tags h1,h2,h3,h4 -t 600000
28+
node render-book.mjs _site-pdf\book.html -o _pdf\book.pdf --outline-tags h1,h2,h3,h4 --additional-script ..\perf\detach-pages.js

docs/lib/outline.mjs

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
// Adapted verbatim from pagedjs-cli 0.4.3 src/outline.js
2+
// (https://github.com/pagedjs/pagedjs-cli) -- MIT, Copyright (c) 2018
3+
// Adam Hyde. Pulled in directly so we no longer need the pagedjs-cli
4+
// dependency.
5+
//
6+
// Two exports:
7+
// parseOutline(page, tags, enableWarnings) -- runs in the browser
8+
// via page.evaluate. Walks document.querySelectorAll(tags.join(','))
9+
// to produce a nested outline tree of {title, destination, children}.
10+
// Also creates a hidden <a href="#id"> link-holder so Chrome
11+
// registers a named destination for each heading -- without that,
12+
// the named-destination Dest entries we write in setOutline would
13+
// point nowhere.
14+
//
15+
// setOutline(pdfDoc, outline, enableWarnings) -- runs in Node on the
16+
// parsed pdf-lib document. Walks the outline tree and writes a
17+
// /Outlines tree of PDF dicts using pdf-lib's low-level API
18+
// (PDFDict.fromMapWithContext, etc.). Each entry's Dest is a name
19+
// that Chrome's /Dests catalog entry resolves to a page+coords.
20+
21+
import { PDFDict, PDFName, PDFNumber, PDFHexString } from "pdf-lib";
22+
import { decode as htmlEntitiesDecode } from "html-entities";
23+
24+
const SanitizeXMLRx = /<[^>]+>/g;
25+
26+
/**
 * Turns raw heading text into a plain string suitable for a bookmark title.
 *
 * Anything shaped like a markup tag (`<...>`) is removed first and the result
 * is then passed through the HTML-entity decoder. Because stripping happens
 * before decoding, an encoded tag such as `&lt;b&gt;` survives as literal text.
 *
 * @param {string} string - Raw title text.
 * @returns {string} The title with tags removed and entities decoded.
 */
function sanitize (string) {
	// Skip the regex entirely when there is no "<" to anchor a tag match.
	const withoutTags = string.includes("<")
		? string.replace(SanitizeXMLRx, "")
		: string;
	return htmlEntitiesDecode(withoutTags);
}
32+
33+
/**
 * Collects the document's headings into a nested outline tree.
 *
 * The callback is handed to `page.evaluate`, so it must stay self-contained:
 * it may only use `document` and the `tags` argument it receives.
 *
 * Besides building the tree, it inserts one hidden `<a href="#dest">` per
 * heading into a `display: none` container at the top of `<body>`, so that a
 * named destination exists for every bookmark.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)`.
 * @param {string[]} tags - Lower-case tag names; the index of a tag in this
 *   list is its nesting depth (index 0 is the outermost level).
 * @param {boolean} [enableWarnings] - Accepted for signature compatibility;
 *   not used by this function.
 * @returns {Promise<object[]>} Top-level outline nodes, each shaped
 *   `{title, destination, children, depth}`.
 */
export async function parseOutline(page, tags, enableWarnings) {
	return await page.evaluate((tags) => {
		// Snapshot the headings before touching the DOM so the helper elements
		// created below can never be picked up by the selector.
		const headings = Array.from(document.querySelectorAll(tags.join(",")));

		const root = {children: [], depth: -1};
		let currentOutlineNode = root;

		const linkHolder = document.createElement("div");
		const body = document.querySelector("body");
		linkHolder.style.display = "none";
		body.insertBefore(linkHolder, body.firstChild);

		for (const tag of headings) {
			const orderDepth = tags.indexOf(tag.tagName.toLowerCase());
			// encode section ID until https://bugs.chromium.org/p/chromium/issues/detail?id=985254 is fixed
			const dest = encodeURIComponent(tag.id).replace(/%/g, "#25");

			// Register the destination exactly once per heading. (Previously the
			// link was appended again for every level climbed back up the tree.)
			const hiddenLink = document.createElement("a");
			hiddenLink.href = `#${dest}`;
			linkHolder.appendChild(hiddenLink);

			// Climb until the current node is no deeper than this heading. Only
			// non-root nodes can be deeper, so `parent` is always defined here.
			while (orderDepth < currentOutlineNode.depth) {
				currentOutlineNode = currentOutlineNode.parent;
			}

			// Same depth => sibling of the current node (attach to its parent);
			// deeper => child of the current node. The root has no parent, so a
			// heading that ties with the root's depth is attached to the root.
			const isSibling = orderDepth === currentOutlineNode.depth;
			const parentNode = isSibling && currentOutlineNode.parent
				? currentOutlineNode.parent
				: currentOutlineNode;

			const newNode = {
				title: tag.innerText.trim(),
				destination: dest,
				children: [],
				depth: orderDepth,
			};
			newNode.parent = parentNode;
			parentNode.children.push(newNode);
			currentOutlineNode = newNode;
		}

		// Parent links make the tree cyclic; clear them before it is returned
		// from the evaluate callback.
		const stripParentProperty = (node) => {
			node.parent = undefined;
			for (const child of node.children) {
				stripParentProperty(child);
			}
		};
		stripParentProperty(root);
		return root.children;
	}, tags);
}
97+
98+
/**
 * Reserves an object reference for every outline item, depth-first.
 *
 * Each item is annotated in place with `ref` (its own freshly reserved
 * reference) and `parentRef` (the reference of the enclosing item, or of the
 * outline root for the top layer). An item's reference is reserved before any
 * of its children's.
 *
 * @param {object[]} layer - Sibling outline items; each has a `children` array.
 * @param {object} context - Provides `nextRef()` to reserve a reference.
 * @param {*} parentRef - Reference that owns this layer.
 */
function setRefsForOutlineItems (layer, context, parentRef) {
	layer.forEach((entry) => {
		const ownRef = context.nextRef();
		entry.ref = ownRef;
		entry.parentRef = parentRef;
		setRefsForOutlineItems(entry.children, context, ownRef);
	});
}
105+
106+
/**
 * Counts every item in an outline layer, including all nested descendants.
 *
 * @param {object[]} layer - Sibling outline items; each has a `children` array.
 * @returns {number} Number of items in `layer` plus all of their descendants.
 */
function countChildrenOfOutline (layer) {
	return layer.reduce(
		(total, entry) => total + 1 + countChildrenOfOutline(entry.children),
		0,
	);
}
114+
115+
/**
 * Writes one PDF dictionary per outline item into `context`, depth-first.
 *
 * Every item must already carry `ref` and `parentRef` (see
 * setRefsForOutlineItems). The dictionary always has Title, Dest and Parent;
 * Prev/Next are added when a sibling exists on that side, and
 * First/Last/Count are added when the item has children. Count is the total
 * number of descendants, not just direct children.
 *
 * @param {object[]} layer - Sibling outline items.
 * @param {object} context - Receives each dictionary via `assign(ref, dict)`.
 */
function buildPdfObjectsForOutline (layer, context) {
	layer.forEach((item, index) => {
		const previous = layer[index - 1];
		const following = layer[index + 1];
		const kids = item.children;

		// Keys are inserted in a fixed order: Title, Dest, Parent, then the
		// optional sibling and child links.
		const dict = new Map();
		dict.set(PDFName.of("Title"), PDFHexString.fromText(sanitize(item.title)));
		dict.set(PDFName.of("Dest"), PDFName.of(item.destination));
		dict.set(PDFName.of("Parent"), item.parentRef);
		if (previous) {
			dict.set(PDFName.of("Prev"), previous.ref);
		}
		if (following) {
			dict.set(PDFName.of("Next"), following.ref);
		}
		if (kids.length > 0) {
			dict.set(PDFName.of("First"), kids[0].ref);
			dict.set(PDFName.of("Last"), kids[kids.length - 1].ref);
			dict.set(PDFName.of("Count"), PDFNumber.of(countChildrenOfOutline(kids)));
		}

		context.assign(item.ref, PDFDict.fromMapWithContext(dict, context));

		buildPdfObjectsForOutline(kids, context);
	});
}
142+
143+
/**
 * Warns on the console about outline items whose destination name is not
 * present in the document's /Dests dictionary.
 *
 * The /Dests dictionary is looked up once and its keys are collected into a
 * Set, so checking the whole tree costs one lookup plus constant-time
 * membership tests (rather than a fresh lookup and linear scan per subtree).
 * Items are visited depth-first, parents before their children.
 *
 * @param {object[]} layer - Outline items; each has `destination` and `children`.
 * @param {object} pdfDoc - Document exposing `context.lookup` and `catalog.get`.
 */
function generateWarningsAboutMissingDestinations (layer, pdfDoc) {
	const dests = pdfDoc.context.lookup(pdfDoc.catalog.get(PDFName.of("Dests")));
	// Dests can be undefined if the PDF wasn't successfully generated (for instance if Paged.js threw an exception)
	if (!dests) {
		return;
	}

	const validDestinationTargets = new Set(
		dests.entries().map(([key]) => key.value()),
	);

	const warnForLayer = (items) => {
		for (const item of items) {
			// Keys in /Dests carry a leading slash; outline destinations do not.
			if (item.destination && !validDestinationTargets.has(`/${item.destination}`)) {
				console.warn(`Unable to find destination "${item.destination}" while generating PDF outline.`);
			}
			warnForLayer(item.children);
		}
	};
	warnForLayer(layer);
}
156+
157+
/**
 * Attaches a bookmark tree to a parsed PDF document.
 *
 * For a non-empty outline this reserves a reference for the /Outlines root,
 * reserves references for every item, writes one dictionary per item, writes
 * the root dictionary (First, Last, Count) and registers it on the catalog
 * under /Outlines. An empty outline leaves the document completely untouched;
 * in particular no object reference is reserved for it.
 *
 * @param {object} pdfDoc - Document exposing `context` and `catalog`.
 * @param {object[]} outline - Top-level outline items (`title`, `destination`,
 *   `children`); items are annotated in place with `ref` and `parentRef`.
 * @param {boolean} [enableWarnings=false] - When true, warn about items whose
 *   destination is missing before building the tree.
 * @returns {Promise<object>} The same `pdfDoc` that was passed in.
 */
export async function setOutline (pdfDoc, outline, enableWarnings=false) {
	// Bail out before reserving anything: a reference that is reserved but
	// never assigned would just waste an object number in the document.
	if (outline.length === 0) {
		return pdfDoc;
	}

	if (enableWarnings) {
		generateWarningsAboutMissingDestinations(outline, pdfDoc);
	}

	const context = pdfDoc.context;
	const outlineRef = context.nextRef();

	setRefsForOutlineItems(outline, context, outlineRef);
	buildPdfObjectsForOutline(outline, context);

	const outlineObject = PDFDict.fromMapWithContext(new Map([
		[PDFName.of("First"), outline[0].ref],
		[PDFName.of("Last"), outline[outline.length - 1].ref],
		[PDFName.of("Count"), PDFNumber.of(countChildrenOfOutline(outline))]
	]), context);
	context.assign(outlineRef, outlineObject);

	pdfDoc.catalog.set(PDFName.of("Outlines"), outlineRef);
	return pdfDoc;
}

0 commit comments

Comments
 (0)