Skip to content

Commit e3f4b40

Browse files
authored
Merge pull request #154 from KubaO/staging
Squeeze performance out of the (post)processing stage of PDF build.
2 parents 53b8f25 + 80e2585 commit e3f4b40

34 files changed

Lines changed: 11147 additions & 29 deletions

docs/lib/fast-array-onebuf.mjs

Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
// One-buffer PDFArray: every committed element lives in a single
2+
// append-only JS Array (arrayMain), kept for the document's lifetime.
3+
// Mirror of fast-dict-onebuf's strategy applied to PDFArray. Backing
4+
// is a plain heterogeneous JS Array -- slots hold the original
5+
// PDFObject references directly. No encoding, no decode on read; the
6+
// hot path is `arrayMain[start + i]`.
7+
//
8+
// Phase 3 of fast-dict-encoded did the same range-view refactor on
9+
// PDFArray but used a Float64Array + encoded slots (mirroring its
10+
// dict shape). The encoded backing cost ~300 ms of decodeValue
11+
// dispatch during save (PDFArray.copyBytesInto iterates ~500 k
12+
// elements). This shim keeps the heap win (~19 MB on the book by
13+
// removing each PDFArray's per-instance `this.array = []`) without
14+
// paying the decode cost: slots are JS references, reads are direct.
15+
//
16+
// 40-bit packed Number layout (well within Number.MAX_SAFE_INTEGER):
17+
// bits 0-23: start (24 bits, max 16 M slots in arrayMain)
18+
// bits 24-39: length (16 bits, max 65 535 elements; max observed
19+
// ~25 k on the book)
20+
// bits 40-52: spare (13 bits)
21+
//
22+
// Recursion. parseArray pushes elements onto a per-parser _arrayTemp;
23+
// inner parseArray invocations append on top, commit their frame to
24+
// arrayMain in one append, and pop temp back. Inner / outer ranges
25+
// in arrayMain do not overlap. _arrayTemp is independent of
26+
// fast-dict-onebuf's _dictTemp so dict <-> array recursion is fine.
27+
//
28+
// Mutations:
29+
// - set(i, v): in-place replace (safe; no length change)
30+
// - push(v) at HWM: in-place extend (no other arrays follow)
31+
// - push(v) not at HWM: COW the range to tail, then push
32+
// - insert / remove: always COW (shifts would corrupt neighbours)
33+
// Same at-HWM-determines-safety logic as fast-dict-onebuf; no owned
34+
// bit needed (see fast-dict-onebuf commit 7e8b1f7).
35+
//
36+
// Singleton PDFContext (one PDFDocument.load per process in our
37+
// pipeline). The singleton is duplicated rather than shared with
38+
// fast-dict-onebuf -- the mechanism is ten lines and keeping each
39+
// shim independently injectable is worth more than dedup'ing it.
40+
// Both shims end up holding references to the same PDFContext.
41+
//
42+
// Composes with --fast-dict-onebuf. Mutually exclusive with
43+
// --fast-dict-encoded (which subsumes both via its own encoded shape).
44+
45+
import { createRequire } from 'node:module';
46+
47+
const require = createRequire(import.meta.url);
48+
const PDFArray = require('pdf-lib/cjs/core/objects/PDFArray.js').default;
49+
const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
50+
const CharCodes = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
51+
52+
// ---- The single buffer ---------------------------------------------
53+
54+
// Pre-sized to total array slots + slack on the book. Other workloads
55+
// grow it naturally from this starting size. When the measure-pass
56+
// shim runs first, it calls setExpectedArraySlots() before parse,
57+
// which resizes `arrayMain` to exact measured demand via
58+
// `arrayMain.length = N`.
59+
// Initial number of slots reserved in the shared backing array.
const ARRAY_MAIN_INITIAL_CAP = 800000;

// Count of slots in `arrayMain` that are in use; new ranges are appended
// starting at this index.
let arrayMainLen = 0;

// The one shared backing store. Exported so other modules can read it.
export const arrayMain = new Array(ARRAY_MAIN_INITIAL_CAP);

/**
 * Read the current number of used slots in `arrayMain`.
 * @returns {number} the current value of the append cursor.
 */
export function getArrayMainLen() {
  return arrayMainLen;
}
65+
66+
// Resize arrayMain in place. Must be called before any parseArray /
67+
// withContext (i.e. while arrayMainLen is still 0). `slack` is a
68+
// multiplier on `slots`; default 1.0 (exact). Same in-place-resize
69+
// rationale as fast-dict-onebuf's setExpectedDictSlots: reassigning
70+
// the module-level binding invalidates V8's inline-cache slots in
71+
// every closure that reads it, and the deopt + recompile shows up as
72+
// a parse-time allocation spike.
73+
/**
 * Resize the shared backing array in place to `ceil(slots * slack)` slots.
 *
 * @param {number} slots - expected number of array slots.
 * @param {number} [slack=1.0] - multiplier applied to `slots`.
 * @throws {Error} if any slot has already been committed (arrayMainLen > 0).
 */
export function setExpectedArraySlots(slots, slack = 1.0) {
  const parseAlreadyStarted = arrayMainLen > 0;
  if (parseAlreadyStarted) {
    throw new Error(
      `fast-array-onebuf: setExpectedArraySlots called after parse started (arrayMainLen=${arrayMainLen})`,
    );
  }
  // Assign to `.length` rather than rebinding `arrayMain`, so the binding
  // itself never changes.
  const wantedCapacity = Math.ceil(slots * slack);
  arrayMain.length = wantedCapacity;
}
81+
82+
// ---- Bit-packing helpers -------------------------------------------
83+
84+
// 2^24: size of the `start` field, and the factor that lifts `length`
// above it inside the packed Number.
const POW_24 = 16777216;
const MASK_24 = 0xFFFFFF;
const MASK_16 = 0xFFFF;

const MAX_START = POW_24; // exclusive
const MAX_LENGTH = 1 << 16; // 65 536, exclusive

/**
 * Pack a (start, length) range into one Number: `start + length * 2^24`.
 *
 * @param {number} start - first slot of the range; integer in [0, 2^24).
 * @param {number} length - slot count of the range; integer in [0, 2^16).
 * @returns {number} the packed descriptor, readable via _start / _length.
 * @throws {Error} if either field is not a non-negative integer or does
 *   not fit its bit budget.
 */
function pack(start, length) {
  // Negative, fractional or NaN fields would not round-trip through
  // _start / _length, so reject them instead of returning a corrupt
  // descriptor.
  if (!Number.isInteger(start) || start < 0) {
    throw new Error(`fast-array-onebuf: start ${start} is not a non-negative integer`);
  }
  if (!Number.isInteger(length) || length < 0) {
    throw new Error(`fast-array-onebuf: length ${length} is not a non-negative integer`);
  }
  if (start >= MAX_START) throw new Error(`fast-array-onebuf: start ${start} exceeds 24-bit budget`);
  if (length >= MAX_LENGTH) throw new Error(`fast-array-onebuf: length ${length} exceeds 16-bit budget`);
  return start + length * POW_24;
}

// Low 24 bits of the descriptor. `&` works on the value modulo 2^32, which
// still contains the whole `start` field.
function _start(d) { return d & MASK_24; }

// The next 16 bits. Division is used instead of a shift because the
// descriptor is wider than 32 bits.
function _length(d) { return Math.floor(d / POW_24) & MASK_16; }
100+
// ---- Singleton context ---------------------------------------------
101+
102+
// The one context shared by every array built through this module;
// null until the first real context is registered.
let _singletonContext = null;

/**
 * Record `ctx` as the singleton context, or verify it matches the one
 * already recorded.
 *
 * @param {object|null|undefined} ctx - context to register. null/undefined
 *   is ignored: storing it would make the `=== null` "not yet registered"
 *   test fail for the next real context and raise a misleading
 *   "second distinct one" error.
 * @throws {Error} if a different non-null context was registered earlier.
 */
function _registerContext(ctx) {
  if (ctx == null) return;
  if (_singletonContext === null) {
    _singletonContext = ctx;
  } else if (_singletonContext !== ctx) {
    throw new Error('fast-array-onebuf: expected a singleton PDFContext, got a second distinct one.');
  }
}
111+
112+
// ---- Append + COW helpers ------------------------------------------
113+
114+
/**
 * Copy `lenSlots` entries of `temp`, starting at `fromOffset`, onto the
 * tail of `arrayMain` and advance `arrayMainLen` past them.
 */
function _appendFromTemp(temp, fromOffset, lenSlots) {
  const base = arrayMainLen;
  let copied = 0;
  while (copied < lenSlots) {
    arrayMain[base + copied] = temp[fromOffset + copied];
    copied += 1;
  }
  arrayMainLen = base + lenSlots;
}
120+
121+
/**
 * Append every element of `arr` to the tail of `arrayMain`.
 * Same copy as _appendFromTemp with the whole of `arr` as the frame.
 */
function _appendArray(arr) {
  _appendFromTemp(arr, 0, arr.length);
}
126+
127+
// COW: copy this array's range to arrayMain's tail. If already at
128+
// the HWM, nothing to copy -- return d unchanged.
129+
/**
 * Relocate `pa`'s range to the tail of `arrayMain`.
 *
 * @param {{d: number}} pa - object carrying a packed descriptor in `d`.
 * @returns {number} a descriptor for the relocated range, or `pa.d`
 *   unchanged when the range already ends at `arrayMainLen`. The caller
 *   is responsible for storing the result; `pa.d` is not written here.
 */
function _cow(pa) {
  const current = pa.d;
  const from = _start(current);
  const count = _length(current);
  const to = arrayMainLen;
  if (from + count === to) return current; // already the last range
  for (let k = 0; k < count; k++) {
    arrayMain[to + k] = arrayMain[from + k];
  }
  arrayMainLen = to + count;
  return pack(to, count);
}
139+
140+
// ---- Construction --------------------------------------------------
141+
//
142+
// Use a plain-function constructor (`_FastArray`) with the prototype
143+
// aliased to PDFArray.prototype instead of `Object.create + writes`.
144+
// Same shape change fast-refs-class and fast-dict-onebuf made: V8
145+
// gives `new`-built instances a stable hidden class from the first
146+
// instance and drops per-instance cost vs the slow-property path
147+
// taken by Object.create + later property writes.
148+
//
149+
// No subclass dispatch needed -- PDFArray has no subclasses in
150+
// pdf-lib (unlike PDFDict's PDFCatalog / PDFPageTree / PDFPageLeaf).
151+
152+
/**
 * Plain constructor whose only own property is the packed descriptor `d`.
 * Instances share PDFArray.prototype, so they respond to every PDFArray
 * method.
 */
function _FastArray(d) {
  this.d = d;
}
_FastArray.prototype = PDFArray.prototype;
154+
155+
/**
 * Build an array object viewing `length` slots of `arrayMain` beginning
 * at `start`, after registering `ctx` as the singleton context.
 */
function _makeFromRange(start, length, ctx) {
  _registerContext(ctx);
  const descriptor = pack(start, length);
  return new _FastArray(descriptor);
}
159+
160+
/**
 * Append the elements of `arr` to `arrayMain` and return an array object
 * viewing the slots just written.
 */
function _makeFromAppend(arr, ctx) {
  const rangeStart = arrayMainLen; // captured before the append moves it
  _appendArray(arr);
  const rangeLength = arr.length;
  return _makeFromRange(rangeStart, rangeLength, ctx);
}
165+
166+
if (!PDFArray.prototype.__fastArrayOnebufInstalled) {
167+
168+
// ---- PDFArray.prototype -----------------------------------------
169+
170+
// Element count, decoded from the length field of the packed descriptor.
PDFArray.prototype.size = function () {
  const descriptor = this.d;
  return _length(descriptor);
};
173+
174+
/**
 * Append `object` to this array.
 *
 * If this array's range ends exactly at `arrayMainLen` it is extended in
 * place; otherwise the existing elements are first copied to the tail and
 * the array is re-pointed there.
 *
 * @param {*} object - element to append.
 * @throws {Error} from pack() if the grown range does not fit the
 *   descriptor. In that case nothing has been modified.
 */
PDFArray.prototype.push = function (object) {
  const d0 = this.d;
  const start0 = _start(d0);
  const length0 = _length(d0);
  const atHwm = start0 + length0 === arrayMainLen;
  const newStart = atHwm ? start0 : arrayMainLen;

  // Pack before touching the buffer: pack() is what rejects an oversized
  // range, and a rejected push must not leave copied slots or an advanced
  // arrayMainLen behind.
  const dNext = pack(newStart, length0 + 1);

  if (!atHwm) {
    for (let i = 0; i < length0; i++) {
      arrayMain[newStart + i] = arrayMain[start0 + i];
    }
  }
  arrayMain[newStart + length0] = object;
  arrayMainLen = newStart + length0 + 1;
  this.d = dNext;
};
186+
187+
/**
 * Read the element at `index`.
 *
 * @param {number} index - zero-based position within this array.
 * @returns {*} the element, or undefined when `index` is outside
 *   [0, size). Without the range check an out-of-range index would read a
 *   slot of `arrayMain` that lies outside this array's range.
 */
PDFArray.prototype.get = function (index) {
  const d = this.d;
  if (!(index >= 0 && index < _length(d))) return undefined;
  return arrayMain[_start(d) + index];
};
190+
191+
/**
 * Replace the element at `index` with `object`.
 *
 * @param {number} index - zero-based position within this array.
 * @param {*} object - new element.
 * @throws {Error} if `index` is not an integer in [0, size]. Writing
 *   outside the range would overwrite a slot of `arrayMain` that this
 *   array does not own, so it is rejected rather than performed.
 */
PDFArray.prototype.set = function (index, object) {
  const d = this.d;
  const length = _length(d);
  if (Number.isInteger(index) && index >= 0 && index < length) {
    arrayMain[_start(d) + index] = object;
    return;
  }
  if (index === length) {
    // One past the end is an append; push() grows the range.
    this.push(object);
    return;
  }
  throw new Error(`fast-array-onebuf: set index ${index} out of range (size=${length})`);
};
194+
195+
/**
 * Find the first position whose element is strictly equal (===) to
 * `object`.
 * @returns {number|undefined} the zero-based position, or undefined when
 *   no element matches.
 */
PDFArray.prototype.indexOf = function (object) {
  const descriptor = this.d;
  const first = _start(descriptor);
  const limit = first + _length(descriptor);
  for (let slot = first; slot < limit; slot++) {
    if (arrayMain[slot] === object) return slot - first;
  }
  return undefined;
};
204+
205+
/**
 * Insert `object` before position `index`.
 *
 * The array is always rebuilt at the tail of `arrayMain`: shifting
 * elements in place would overwrite whatever follows this range.
 *
 * `index` is normalised the way Array.prototype.splice normalises its
 * start argument: truncated to an integer, negative values count back
 * from the end, and the result is clamped to [0, size]. Without this, an
 * index outside [0, size] would copy slots that lie outside this array's
 * range.
 *
 * @param {number} index - position to insert at.
 * @param {*} object - element to insert.
 * @throws {Error} from pack() if the grown range does not fit the
 *   descriptor. In that case nothing has been modified.
 */
PDFArray.prototype.insert = function (index, object) {
  const d0 = this.d;
  const start0 = _start(d0);
  const length0 = _length(d0);

  let at = Math.trunc(index) || 0; // NaN / undefined / -0 -> 0
  at = at < 0 ? Math.max(length0 + at, 0) : Math.min(at, length0);

  const newStart = arrayMainLen;
  // Pack first so an oversized range is rejected before any slot is written.
  const dNext = pack(newStart, length0 + 1);

  let tail = newStart;
  for (let i = 0; i < at; i++) {
    arrayMain[tail++] = arrayMain[start0 + i];
  }
  arrayMain[tail++] = object;
  for (let i = at; i < length0; i++) {
    arrayMain[tail++] = arrayMain[start0 + i];
  }
  arrayMainLen = tail;
  this.d = dNext;
};
221+
222+
/**
 * Remove the element at position `index`.
 *
 * `index` is normalised the way Array.prototype.splice normalises its
 * start argument (truncated, negative counts from the end, clamped to
 * [0, size]). If that leaves nothing to remove -- empty array, or an
 * index at or past the end -- the call is a no-op. Previously such a call
 * still shortened the recorded length by one, silently dropping the last
 * element, or packing a length of -1 for an empty array.
 *
 * When an element is removed the survivors are rebuilt at the tail of
 * `arrayMain`, for the same reason insert() does so.
 *
 * @param {number} index - position of the element to remove.
 */
PDFArray.prototype.remove = function (index) {
  const d0 = this.d;
  const start0 = _start(d0);
  const length0 = _length(d0);

  let at = Math.trunc(index) || 0; // NaN / undefined / -0 -> 0
  at = at < 0 ? Math.max(length0 + at, 0) : Math.min(at, length0);
  if (at === length0) return; // nothing at that position

  const newStart = arrayMainLen;
  const dNext = pack(newStart, length0 - 1);

  let tail = newStart;
  for (let i = 0; i < length0; i++) {
    if (i === at) continue;
    arrayMain[tail++] = arrayMain[start0 + i];
  }
  arrayMainLen = tail;
  this.d = dNext;
};
234+
235+
/**
 * Snapshot this array's elements into a fresh plain JS Array.
 * @returns {Array} a new array; later changes to it do not affect
 *   `arrayMain`.
 */
PDFArray.prototype.asArray = function () {
  const descriptor = this.d;
  const first = _start(descriptor);
  return arrayMain.slice(first, first + _length(descriptor));
};
243+
244+
/**
 * Create a new array object holding the same element references, stored
 * in a fresh range at the tail of `arrayMain`.
 *
 * @param {object} [context] - context to register; when falsy the
 *   currently registered singleton is used.
 * @returns {object} the new array object.
 */
PDFArray.prototype.clone = function (context) {
  const source = this.d;
  const from = _start(source);
  const count = _length(source);
  const to = arrayMainLen;
  let copied = 0;
  while (copied < count) {
    arrayMain[to + copied] = arrayMain[from + copied];
    copied += 1;
  }
  arrayMainLen = to + count;
  _registerContext(context || _singletonContext);
  return new _FastArray(pack(to, count));
};
254+
255+
/**
 * Render as `[ e0 e1 ... ]`: each element's toString() followed by one
 * space, wrapped in `[ ` and `]`.
 * @returns {string}
 */
PDFArray.prototype.toString = function () {
  const descriptor = this.d;
  const first = _start(descriptor);
  const limit = first + _length(descriptor);
  let text = '[ ';
  for (let slot = first; slot < limit; slot++) {
    text += `${arrayMain[slot].toString()} `;
  }
  return `${text}]`;
};
263+
264+
/**
 * Number of bytes copyBytesInto() writes: 3 for the brackets and leading
 * space, plus each element's own size and one separator per element.
 * @returns {number}
 */
PDFArray.prototype.sizeInBytes = function () {
  const descriptor = this.d;
  const first = _start(descriptor);
  const count = _length(descriptor);
  let total = 3 + count; // fixed bytes + one trailing space per element
  for (let k = 0; k < count; k++) {
    total += arrayMain[first + k].sizeInBytes();
  }
  return total;
};
272+
273+
/**
 * Serialise into `buffer` starting at `offset`: `[`, space, then each
 * element followed by a space, then `]`.
 *
 * @param {Uint8Array|number[]} buffer - destination, written by index.
 * @param {number} offset - index of the first byte to write.
 * @returns {number} how many bytes were written.
 */
PDFArray.prototype.copyBytesInto = function (buffer, offset) {
  const descriptor = this.d;
  const first = _start(descriptor);
  const limit = first + _length(descriptor);

  let cursor = offset;
  buffer[cursor] = CharCodes.LeftSquareBracket;
  buffer[cursor + 1] = CharCodes.Space;
  cursor += 2;

  for (let slot = first; slot < limit; slot++) {
    // Each element reports how many bytes it wrote.
    cursor += arrayMain[slot].copyBytesInto(buffer, cursor);
    buffer[cursor] = CharCodes.Space;
    cursor += 1;
  }

  buffer[cursor] = CharCodes.RightSquareBracket;
  return cursor + 1 - offset;
};
287+
288+
// lookup, lookupMaybe, asRectangle, scalePDFNumbers stay on the
289+
// upstream prototype -- they call this.get / this.size / this.set
290+
// and dispatch through our overrides.
291+
292+
// `context` is an accessor on the prototype: reads return the registered
// singleton, writes are accepted and discarded.
const contextAccessor = {
  configurable: true,
  get() {
    return _singletonContext;
  },
  set(_ctx) {
    /* singleton is source of truth */
  },
};
Object.defineProperty(PDFArray.prototype, 'context', contextAccessor);
297+
298+
// ---- PDFArray factory -------------------------------------------
299+
300+
// Factory for an empty array: a zero-length range positioned at the
// current tail of `arrayMain`.
PDFArray.withContext = function (context) {
  return _makeFromRange(arrayMainLen, 0, context);
};
303+
304+
// ---- PDFObjectParser.prototype.parseArray -----------------------
305+
//
306+
// Same temp/commit pattern as fast-dict-onebuf's parseDict:
307+
// each parser instance carries its own _arrayTemp + length cursor;
308+
// parseArray pushes elements onto temp's tail, commits the frame
309+
// to arrayMain in one contiguous append, pops temp back to
310+
// frameStart, returns a PDFArray view into arrayMain.
311+
312+
/**
 * Parse a `[ ... ]` array at the parser's current position.
 *
 * Elements are collected on a per-parser scratch stack (`_arrayTemp`,
 * with `_arrayTempLen` as its cursor). Nested arrays parsed through
 * parseObject() stack their elements above this call's frame and unwind
 * before control returns here. Once the closing bracket is seen, the
 * frame is appended to `arrayMain` in one go and an array object viewing
 * those slots is returned.
 *
 * @returns {object} array object over the newly committed range.
 */
PDFObjectParser.prototype.parseArray = function fastParseArrayOneBuf() {
  const bytes = this.bytes;
  bytes.assertNext(CharCodes.LeftSquareBracket);
  this.skipWhitespaceAndComments();

  if (this._arrayTemp === undefined) {
    this._arrayTemp = new Array(64); // grows naturally if needed
    this._arrayTempLen = 0;
  }
  const temp = this._arrayTemp;
  const frameStart = this._arrayTempLen;

  try {
    while (bytes.peek() !== CharCodes.RightSquareBracket) {
      const element = this.parseObject(); // may recurse
      temp[this._arrayTempLen++] = element;
      this.skipWhitespaceAndComments();
    }
    bytes.assertNext(CharCodes.RightSquareBracket);

    const frameLen = this._arrayTempLen - frameStart;
    const start = arrayMainLen;
    _appendFromTemp(temp, frameStart, frameLen);
    return _makeFromRange(start, frameLen, this.context);
  } finally {
    // Pop this frame on every exit path. If an element fails to parse and
    // the caller carries on with the same parser, the scratch stack must
    // not keep the abandoned frame's entries.
    this._arrayTempLen = frameStart;
  }
};
338+
339+
PDFArray.prototype.__fastArrayOnebufInstalled = true;
340+
}

0 commit comments

Comments
 (0)