Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1080,7 +1080,16 @@ def page_index_main(doc, opt=None):
logger.info({'total_token': sum([page[1] for page in page_list])})

async def page_index_builder():
structure = await tree_parser(page_list, opt, doc=doc, logger=logger)
outline_structure = get_pdf_outline_tree(doc)
if outline_structure:
logger.info({
'outline_first': True,
'outline_node_count': len(structure_to_list(outline_structure))
})
structure = outline_structure
else:
logger.info({'outline_first': False})
structure = await tree_parser(page_list, opt, doc=doc, logger=logger)
if opt.if_add_node_id == 'yes':
write_node_id(structure)
if opt.if_add_node_text == 'yes':
Expand Down Expand Up @@ -1151,4 +1160,4 @@ def validate_and_truncate_physical_indices(toc_with_page_number, page_list_lengt
if truncated_items:
print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")

return toc_with_page_number
return toc_with_page_number
92 changes: 92 additions & 0 deletions pageindex/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,3 +708,95 @@ def print_wrapped(text, width=100):
for line in text.splitlines():
print(textwrap.fill(line, width=width))


def _outline_destination_title(dest) -> str:
    """Return the cleaned-up title of a PDF outline entry.

    The title is read from the ``title`` attribute when it is set, otherwise
    from the ``"/Title"`` key of mapping-like entries (anything exposing
    ``get``).

    Args:
        dest: An outline entry exposing ``title`` and/or ``get``.

    Returns:
        The title with carriage returns removed and surrounding whitespace
        stripped, or an empty string when no title is available.
    """
    title = getattr(dest, "title", None)
    if title is None and hasattr(dest, "get"):
        title = dest.get("/Title")
    if not title:
        return ""
    if isinstance(title, (bytes, bytearray)):
        # A title that the PDF library could not decode may arrive as raw
        # bytes; bytes.replace("\r", "") would raise TypeError, so decode
        # leniently instead of losing the whole outline.
        title = bytes(title).decode("utf-8", errors="replace")
    elif not isinstance(title, str):
        title = str(title)
    return title.replace("\r", "").strip()


def _outline_destination_page(reader, dest) -> int | None:
try:
page = reader.get_destination_page_number(dest) + 1
return page if page > 0 else None
except Exception:
return None


def _parse_pdf_outline_items(reader, items):
    """Convert a nested outline listing into a list of tree nodes.

    ``items`` interleaves outline entries with plain lists. A list that
    directly follows an entry holds that entry's children; any other list is
    ignored.

    Each node is a dict with ``title``, ``start_index`` (the page resolved by
    ``_outline_destination_page``, possibly ``None``) and ``nodes`` (its
    children). A node whose own page could not be resolved takes the first
    non-``None`` ``start_index`` found among its direct children.
    """
    parsed = []
    awaiting_children = False
    for entry in items:
        if isinstance(entry, list):
            # Only a list right after an entry belongs to that entry.
            if awaiting_children:
                parent = parsed[-1]
                children = _parse_pdf_outline_items(reader, entry)
                parent["nodes"] = children
                if parent["start_index"] is None:
                    parent["start_index"] = next(
                        (
                            kid["start_index"]
                            for kid in children
                            if kid.get("start_index") is not None
                        ),
                        None,
                    )
            awaiting_children = False
            continue

        parsed.append(
            {
                "title": _outline_destination_title(entry),
                "start_index": _outline_destination_page(reader, entry),
                "nodes": [],
            }
        )
        awaiting_children = True
    return parsed


def _assign_outline_end_indexes(nodes, fallback_end: int) -> None:
    """Fill in ``end_index`` on every node of an outline tree, in place.

    A node normally ends on the page before the next sibling that has a
    ``start_index``; without such a sibling it ends at ``fallback_end``. The
    end is never allowed to fall before the node's own ``start_index``. A
    node with children passes that end down as their fallback and then takes
    the largest ``end_index`` among them.
    """
    # Walk right-to-left once to learn, for each position, the start page of
    # the nearest later sibling that has one.
    following_starts = [None] * len(nodes)
    upcoming = None
    for pos in range(len(nodes) - 1, -1, -1):
        following_starts[pos] = upcoming
        own_start = nodes[pos].get("start_index")
        if own_start is not None:
            upcoming = own_start

    for entry, following in zip(nodes, following_starts):
        if following:
            limit = following - 1
        else:
            limit = fallback_end

        first_page = entry.get("start_index")
        if first_page is not None and limit < first_page:
            limit = first_page

        children = entry["nodes"]
        if not children:
            entry["end_index"] = limit
            continue

        _assign_outline_end_indexes(children, limit)
        known_ends = [
            kid.get("end_index")
            for kid in children
            if kid.get("end_index") is not None
        ]
        entry["end_index"] = max(known_ends) if known_ends else limit


def get_pdf_outline_tree(pdf_path, min_valid_nodes: int = 5):
    """
    Build a tree from embedded PDF outline/bookmarks when present.

    Args:
        pdf_path: Input handed unchanged to ``PyPDF2.PdfReader``.
        min_valid_nodes: Minimum number of nodes, counted over the list
            produced by ``structure_to_list``, that must have a non-None
            ``start_index`` for the outline to be accepted. Defaults to 5,
            the threshold that used to be hard-coded.

    Returns:
        A list of node dicts (``title``, ``start_index``, ``end_index``,
        ``nodes``), or [] when the outline is unavailable or unusable:
        no outline, no titled top-level entries, too few nodes with a
        resolved page, or any error while reading the PDF.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_path)
        outline = reader.outline
        if not isinstance(outline, list) or len(outline) == 0:
            return []

        tree = _parse_pdf_outline_items(reader, outline)
        # Only top-level entries without a title are dropped here.
        tree = [node for node in tree if node.get("title")]
        if not tree:
            return []

        # The last top-level entry runs to the final page of the document.
        _assign_outline_end_indexes(tree, len(reader.pages))

        flat_nodes = structure_to_list(tree)
        valid_nodes = [node for node in flat_nodes if node.get("start_index") is not None]
        # Sparse outlines are not good enough to replace the normal parser.
        if len(valid_nodes) < min_valid_nodes:
            return []

        return tree
    except Exception:
        # Deliberately best-effort: any failure means "no usable outline",
        # which callers treat the same as an empty result.
        return []
60 changes: 60 additions & 0 deletions tests/test_outline_first.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import unittest
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import AsyncMock, patch

from pageindex.page_index import page_index_main
from pageindex.utils import get_pdf_outline_tree, structure_to_list


class OutlineFirstTests(unittest.TestCase):
    """Tests for outline extraction and its priority over ``tree_parser``."""

    def test_embedded_outline_builds_a_usable_tree(self):
        """The sample PDF's embedded outline yields a non-empty, consistent tree."""
        pdf_path = Path("examples/documents/PRML.pdf")
        if not pdf_path.exists():
            self.skipTest(f"missing sample PDF: {pdf_path}")

        tree = get_pdf_outline_tree(str(pdf_path))
        self.assertIsInstance(tree, list)
        self.assertTrue(tree, "expected an outline-first tree for PRML.pdf")

        every_node = structure_to_list(tree)
        paged_nodes = [
            entry for entry in every_node if entry.get("start_index") is not None
        ]
        self.assertGreaterEqual(len(paged_nodes), 5)

        titles_present = all(entry["title"] for entry in every_node)
        self.assertTrue(titles_present)

        # Every node with a start page needs an end page at or after it.
        ranges_consistent = all(
            entry.get("end_index") is not None
            and entry["end_index"] >= entry["start_index"]
            for entry in paged_nodes
        )
        self.assertTrue(ranges_consistent)

    def test_page_index_main_prefers_outline_tree_over_tree_parser(self):
        """When an outline tree is available, ``tree_parser`` is never awaited."""
        expected_tree = [
            {"title": "Outline Root", "start_index": 1, "end_index": 3, "nodes": []}
        ]
        option_values = {
            "model": None,
            "if_add_node_id": "no",
            "if_add_node_text": "no",
            "if_add_node_summary": "no",
            "if_add_doc_description": "no",
        }
        opt = SimpleNamespace(**option_values)

        # Raises if the code under test ever falls back to the normal parser.
        forbidden_parser = AsyncMock(
            side_effect=AssertionError("tree_parser should not run")
        )

        with patch("pageindex.page_index.get_page_tokens", return_value=[("page", 1)]), \
                patch("pageindex.page_index.get_pdf_outline_tree", return_value=expected_tree), \
                patch("pageindex.page_index.tree_parser", forbidden_parser):
            with patch("pageindex.page_index.JsonLogger") as logger_cls:
                logger_cls.return_value.info.return_value = None
                result = page_index_main("examples/documents/PRML.pdf", opt)

        self.assertEqual(result["structure"], expected_tree)
        forbidden_parser.assert_not_awaited()


# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()