Skip to content

Commit a5e9940

Browse files
authored
Merge pull request #21231 from calixteman/simplify_extract
Simplify '#getFilteredPageIndices' and '#resolveInsertAfterIndices'
2 parents a328294 + b39440b commit a5e9940

2 files changed

Lines changed: 144 additions & 111 deletions

File tree

src/core/editor/pdf_editor.js

Lines changed: 76 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -563,9 +563,10 @@ class PDFEditor {
563563
* @property {Array<number>} [pageIndices]
564564
* position of the pages in the final document.
565565
* @property {number} [insertAfter]
566-
* 0-based index in the base sequential document after which to insert the
567-
* pages. Sequential pageInfos (those without pageIndices) have their indices
568-
* shifted to accommodate the insertion. Cannot be combined with pageIndices.
566+
* 0-based index in the base sequential sequence after which to insert the
567+
* pages. When every contributing pageInfo has pageIndices, this is
568+
* interpreted against that explicit layout. Cannot be combined with
569+
* pageIndices on the same entry.
569570
*/
570571

571572
/**
@@ -578,81 +579,58 @@ class PDFEditor {
578579
if (!document) {
579580
return [];
580581
}
581-
let keptIndices, keptRanges, deletedIndices, deletedRanges;
582-
for (const page of includePages || []) {
583-
if (Array.isArray(page)) {
584-
(keptRanges ||= []).push(page);
585-
} else {
586-
(keptIndices ||= new Set()).add(page);
582+
const compile = list => {
583+
if (!list?.length) {
584+
return null;
587585
}
588-
}
589-
for (const page of excludePages || []) {
590-
if (Array.isArray(page)) {
591-
(deletedRanges ||= []).push(page);
592-
} else {
593-
(deletedIndices ||= new Set()).add(page);
586+
const indices = new Set();
587+
const ranges = [];
588+
for (const item of list) {
589+
if (Array.isArray(item)) {
590+
ranges.push(item);
591+
} else {
592+
indices.add(item);
593+
}
594594
}
595-
}
596-
const indices = [];
595+
return { indices, ranges };
596+
};
597+
const matches = (index, { indices, ranges }) =>
598+
indices.has(index) ||
599+
ranges.some(([start, end]) => index >= start && index <= end);
600+
const inc = compile(includePages);
601+
const exc = compile(excludePages);
602+
const result = [];
597603
for (let i = 0, ii = document.numPages; i < ii; i++) {
598-
if (deletedIndices?.has(i)) {
604+
if (exc && matches(i, exc)) {
599605
continue;
600606
}
601-
if (deletedRanges) {
602-
let isDeleted = false;
603-
for (const [start, end] of deletedRanges) {
604-
if (i >= start && i <= end) {
605-
isDeleted = true;
606-
break;
607-
}
608-
}
609-
if (isDeleted) {
610-
continue;
611-
}
612-
}
613-
let takePage = false;
614-
if (keptIndices) {
615-
takePage = keptIndices.has(i);
616-
}
617-
if (!takePage && keptRanges) {
618-
for (const [start, end] of keptRanges) {
619-
if (i >= start && i <= end) {
620-
takePage = true;
621-
break;
622-
}
623-
}
624-
}
625-
if (!takePage && !keptIndices && !keptRanges) {
626-
takePage = true;
627-
}
628-
if (takePage) {
629-
indices.push(i);
607+
if (!inc || matches(i, inc)) {
608+
result.push(i);
630609
}
631610
}
632-
return indices;
611+
return result;
633612
}
634613

635614
/**
636615
* Resolve insertAfter pageInfos by converting them (and sequential pageInfos)
637616
* to explicit pageIndices, shifting indices to accommodate each insertion.
638-
* insertAfter values are relative to the base sequential sequence (i.e. the
639-
* concatenation of pages from pageInfos that have neither pageIndices nor
640-
* insertAfter), so they are independent of each other.
641617
* @param {Array<PageInfo>} pageInfos
642618
* @returns {Array<PageInfo>}
643619
*/
644620
#resolveInsertAfterIndices(pageInfos) {
645-
// Single pass: build the base sequential sequence and collect insertAfter
646-
// entries, computing each pageInfo's filtered page count only once and only
647-
// for pageInfos that actually contribute pages.
648-
const sequence = []; // each element is the index into pageInfos
621+
const counts = new Array(pageInfos.length);
622+
const sequence = [];
649623
const insertAfterList = [];
650624
for (let i = 0; i < pageInfos.length; i++) {
651625
const info = pageInfos[i];
652-
if (!info.document || info.pageIndices) {
626+
if (!info.document) {
627+
counts[i] = 0;
628+
continue;
629+
}
630+
const count = (counts[i] = this.#getFilteredPageIndices(info).length);
631+
if (info.pageIndices) {
653632
continue;
654633
}
655-
const count = this.#getFilteredPageIndices(info).length;
656634
if (info.insertAfter === undefined) {
657635
for (let j = 0; j < count; j++) {
658636
sequence.push(i);
@@ -661,50 +639,49 @@ class PDFEditor {
661639
insertAfterList.push({ i, insertAfter: info.insertAfter, count });
662640
}
663641
}
642+
if (insertAfterList.length === 0) {
643+
return pageInfos;
644+
}
664645

665-
insertAfterList.sort((a, b) => a.insertAfter - b.insertAfter);
666-
667-
// Partial pageIndices would auto-fill into free slots at extraction time,
668-
// which collides with the positions insertAfter shifts/assigns. Reject.
669-
if (insertAfterList.length > 0) {
670-
for (const info of pageInfos) {
671-
if (!info.document || !info.pageIndices) {
672-
continue;
673-
}
674-
const filteredCount = this.#getFilteredPageIndices(info).length;
675-
if (info.pageIndices.length < filteredCount) {
676-
throw new Error(
677-
"extractPages: partial pageIndices cannot be combined with insertAfter entries."
678-
);
679-
}
646+
// Partial pageIndices rely on auto-fill in extractPages, which races with
647+
// the slots insertAfter assigns here.
648+
for (let i = 0; i < pageInfos.length; i++) {
649+
const info = pageInfos[i];
650+
if (
651+
info.document &&
652+
info.pageIndices &&
653+
info.pageIndices.length < counts[i]
654+
) {
655+
throw new Error(
656+
"extractPages: partial pageIndices cannot be combined with insertAfter entries."
657+
);
680658
}
681659
}
682660

683-
// If the base sequential sequence is empty but some entries carry explicit
684-
// pageIndices (e.g. after a deletion), resolve each insertAfter against
685-
// that explicit layout: shift existing positions past the insertion point
686-
// and fill the gap. Entries without a document are ignored. With no
687-
// explicit entries we fall through to the sequential branch, whose
688-
// Array.splice already clamps to position 0 on an empty sequence.
689-
const hasExplicitLayout =
690-
insertAfterList.length > 0 &&
691-
pageInfos.some(info => info.document && info.pageIndices);
692-
if (sequence.length === 0 && hasExplicitLayout) {
661+
insertAfterList.sort((a, b) => a.insertAfter - b.insertAfter || a.i - b.i);
662+
663+
// If there is no base sequential sequence, resolve insertAfter against the
664+
// explicit layout. Shift pageIndices values but keep their array order:
665+
// extractPages maps each filtered source page to the corresponding
666+
// pageIndices entry.
667+
if (
668+
sequence.length === 0 &&
669+
pageInfos.some(info => info.document && info.pageIndices)
670+
) {
693671
const updatedPageInfos = pageInfos.slice();
694672
let maxExistingPos = -1;
695673
for (const info of pageInfos) {
696-
if (info.document && info.pageIndices) {
697-
for (const idx of info.pageIndices) {
698-
if (idx > maxExistingPos) {
699-
maxExistingPos = idx;
700-
}
674+
if (!info.document || !info.pageIndices) {
675+
continue;
676+
}
677+
for (const idx of info.pageIndices) {
678+
if (idx > maxExistingPos) {
679+
maxExistingPos = idx;
701680
}
702681
}
703682
}
704683
let offset = 0;
705684
for (const { i, insertAfter, count } of insertAfterList) {
706-
// Clamp to [-1, maxExistingPos] so out-of-range values don't produce
707-
// negative or gap-creating positions.
708685
const threshold = Math.min(
709686
Math.max(insertAfter, -1) + offset,
710687
maxExistingPos
@@ -725,49 +702,39 @@ class PDFEditor {
725702
),
726703
};
727704
}
728-
const insertedIndices = [];
705+
const pageIndices = [];
729706
for (let k = 0; k < count; k++) {
730-
insertedIndices.push(threshold + 1 + k);
707+
pageIndices.push(threshold + 1 + k);
731708
}
732-
const newInfo = {
733-
...updatedPageInfos[i],
734-
pageIndices: insertedIndices,
735-
};
736-
delete newInfo.insertAfter;
737-
updatedPageInfos[i] = newInfo;
709+
const result = { ...updatedPageInfos[i], pageIndices };
710+
delete result.insertAfter;
711+
updatedPageInfos[i] = result;
738712
offset += count;
739713
maxExistingPos += count;
740714
}
741715
return updatedPageInfos;
742716
}
743717

744-
// insertAfter values are relative to the base sequential sequence (stable
745-
// across entries thanks to the sort above); offset converts them to
746-
// current-sequence positions as we splice.
747718
let offset = 0;
748719
for (const { i, insertAfter, count } of insertAfterList) {
749-
const insertPos = insertAfter + 1 + offset;
720+
const insertPos = Math.max(insertAfter, -1) + 1 + offset;
750721
sequence.splice(insertPos, 0, ...new Array(count).fill(i));
751722
offset += count;
752723
}
753724

754-
// Map each pageInfo index to its final positions in the sequence using a
755-
// plain array (keys are dense integers so no need for a Map).
756725
const pageIndicesArr = new Array(pageInfos.length);
757726
for (let pos = 0; pos < sequence.length; pos++) {
758727
const infoIdx = sequence[pos];
759728
(pageIndicesArr[infoIdx] ||= []).push(pos);
760729
}
761730

762-
// Return updated pageInfos: sequential and insertAfter pageInfos now have
763-
// explicit pageIndices; already-indexed pageInfos are left unchanged.
764731
return pageInfos.map((info, i) => {
765732
if (!info.document || info.pageIndices) {
766733
return info;
767734
}
768-
const newInfo = { ...info, pageIndices: pageIndicesArr[i] || [] };
769-
delete newInfo.insertAfter;
770-
return newInfo;
735+
const result = { ...info, pageIndices: pageIndicesArr[i] || [] };
736+
delete result.insertAfter;
737+
return result;
771738
});
772739
}
773740

@@ -792,9 +759,7 @@ class PDFEditor {
792759
task
793760
) {
794761
this.#primaryDocument = primaryDocument;
795-
if (pageInfos.some(info => info.insertAfter !== undefined)) {
796-
pageInfos = this.#resolveInsertAfterIndices(pageInfos);
797-
}
762+
pageInfos = this.#resolveInsertAfterIndices(pageInfos);
798763
const promises = [];
799764
let newIndex = 0;
800765
this.isSingleFile =

test/unit/api_spec.js

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5584,6 +5584,35 @@ small scripts as well as for`);
55845584
return refs;
55855585
};
55865586

5587+
async function checkPageNumberOrder(pageInfos, expected) {
5588+
let loadingTask = getDocument(
5589+
buildGetDocumentParams("page_with_number.pdf")
5590+
);
5591+
let pdfDoc = await loadingTask.promise;
5592+
const data = await pdfDoc.extractPages(pageInfos);
5593+
await loadingTask.destroy();
5594+
5595+
expect(data).not.toBeNull();
5596+
if (!data) {
5597+
return;
5598+
}
5599+
5600+
loadingTask = getDocument(data);
5601+
pdfDoc = await loadingTask.promise;
5602+
expect(pdfDoc.numPages).toEqual(expected.length);
5603+
5604+
for (let i = 0, ii = expected.length; i < ii; i++) {
5605+
const pageNum = i + 1;
5606+
const pdfPage = await pdfDoc.getPage(pageNum);
5607+
const { items } = await pdfPage.getTextContent();
5608+
expect(mergeText(items))
5609+
.withContext(`Page ${pageNum}`)
5610+
.toEqual(expected[i]);
5611+
}
5612+
5613+
await loadingTask.destroy();
5614+
}
5615+
55875616
describe("Merge pdfs", function () {
55885617
it("should merge three PDFs", async function () {
55895618
const loadingTask = getDocument(
@@ -7423,6 +7452,45 @@ small scripts as well as for`);
74237452
await loadingTask.destroy();
74247453
});
74257454

7455+
it("insertAfter preserves non-monotonic explicit pageIndices", async function () {
7456+
// Explicit pageIndices place page 2 before page 1. The inserted page 3
7457+
// lands after position 0, so the final order is "2", "3", "1".
7458+
await checkPageNumberOrder(
7459+
[
7460+
{ document: null, includePages: [0, 1], pageIndices: [1, 0] },
7461+
{ document: null, includePages: [2], insertAfter: 0 },
7462+
],
7463+
["2", "3", "1"]
7464+
);
7465+
});
7466+
7467+
it("insertAfter shifts only the values past the threshold in non-monotonic pageIndices", async function () {
7468+
// pageIndices [2, 0, 1] place pages 1·2·3 as "2", "3", "1". Inserting
7469+
// page 4 after position 1 shifts only values > 1, so [2, 0, 1] becomes
7470+
// [3, 0, 1] and the final order is "2", "3", "4", "1".
7471+
await checkPageNumberOrder(
7472+
[
7473+
{ document: null, includePages: [0, 1, 2], pageIndices: [2, 0, 1] },
7474+
{ document: null, includePages: [3], insertAfter: 1 },
7475+
],
7476+
["2", "3", "4", "1"]
7477+
);
7478+
});
7479+
7480+
it("insertAfter preserves mixed sequential and explicit layouts", async function () {
7481+
// Sequential pages 1 and 2 form the base sequence; page 3 is inserted
7482+
// after base position 0. The explicit page 4 already accounts for the
7483+
// shifted tail at position 3.
7484+
await checkPageNumberOrder(
7485+
[
7486+
{ document: null, includePages: [0, 1] },
7487+
{ document: null, includePages: [3], pageIndices: [3] },
7488+
{ document: null, includePages: [2], insertAfter: 0 },
7489+
],
7490+
["1", "3", "2", "4"]
7491+
);
7492+
});
7493+
74267494
it("rejects partial pageIndices combined with insertAfter", async function () {
74277495
// Partial pageIndices + insertAfter could race over the same slots;
74287496
// the resolver throws and the worker surfaces it as a null result.

0 commit comments

Comments
 (0)