diff --git a/CHANGELOG.md b/CHANGELOG.md
index c3d1b1e0..72b15846 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format
 
+## [Unreleased]
+
+### Added
+
+- Add `PDF.table_of_contents` and `Page.table_of_contents` properties, which expose the document outline (bookmarks) as a list of `{"title", "page_number", "level"}` entries (h/t @AbdullahMehmoodAwan). ([#1034](https://github.com/jsvine/pdfplumber/issues/1034))
+
 ## [0.11.7] - 2025-06-12
 
 ### Added
 
 - Add access to `Page.trimbox`, `Page.bleedbox`, and `Page.artbox` (h/t @samuelbradshaw). ([#1313](https://github.com/jsvine/pdfplumber/issues/1313) + [7e364e6](https://github.com/jsvine/pdfplumber/commit/7e364e6193c6e8bafa9b46587c0fdd4a46405399))
diff --git a/pdfplumber/page.py b/pdfplumber/page.py
index 286e7e15..c0ba5e0a 100644
--- a/pdfplumber/page.py
+++ b/pdfplumber/page.py
@@ -251,6 +251,17 @@ def structure_tree(self) -> List[Dict[str, Any]]:
             return [elem.to_dict() for elem in PDFStructTree(self.pdf, self)]
         except StructTreeMissing:
             return []
+
+    @property
+    def table_of_contents(self) -> List[Dict[str, Any]]:
+        """
+        Return the table of contents of the PDF this page belongs to.
+
+        The outline is a document-level structure, so the result is the
+        same for every page; it is exposed here only for convenience and
+        is identical to `self.pdf.table_of_contents`.
+        """
+        return self.pdf.table_of_contents
 
     @property
     def layout(self) -> LTPage:
diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py
index 9c42bc25..5fc5ce53 100644
--- a/pdfplumber/pdf.py
+++ b/pdfplumber/pdf.py
@@ -203,3 +203,72 @@ def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
             "metadata": self.metadata,
             "pages": [page.to_dict(object_types) for page in self.pages],
         }
+
+    @property
+    def table_of_contents(self) -> List[Dict[str, Any]]:
+        """
+        Return the document's outline (bookmarks), if it has one.
+
+        Each entry is a dictionary with three keys:
+
+        - "title": the bookmark text.
+        - "page_number": the 1-based number of the page the bookmark
+          points to, or None if it does not resolve to one of
+          `self.pages` (e.g. URI actions, unknown named destinations).
+        - "level": the nesting depth reported by pdfminer; top-level
+          bookmarks are level 1.
+
+        Returns an empty list if the document has no outline.
+        """
+        # Imported here so this property is self-contained.
+        from pdfminer.pdftypes import resolve1
+        from pdfminer.psparser import PSException, PSLiteral
+
+        try:
+            outline = self.doc.get_outlines()
+        except PSException as e:
+            # pdfminer raises PDFNoOutlines when there are no bookmarks.
+            logger.debug("Unable to extract outlines: %s", e)
+            return []
+
+        # pdfminer identifies a destination page by its object id.
+        page_numbers = {p.page_obj.pageid: p.page_number for p in self.pages}
+
+        def page_number_for(dest: Any, action: Any) -> Optional[int]:
+            if dest is None:
+                # No direct destination; use the target of a GoTo action.
+                action = resolve1(action)
+                if not isinstance(action, dict):
+                    return None
+                subtype = action.get("S")
+                if not isinstance(subtype, PSLiteral) or subtype.name != "GoTo":
+                    return None
+                dest = action.get("D")
+            dest = resolve1(dest)
+            # Named destinations live in the document's name tree.
+            if isinstance(dest, PSLiteral):
+                dest = resolve1(self.doc.get_dest(dest.name))
+            elif isinstance(dest, (str, bytes)):
+                dest = resolve1(self.doc.get_dest(dest))
+            if isinstance(dest, dict):
+                dest = resolve1(dest.get("D"))
+            if not isinstance(dest, list) or not dest:
+                return None
+            # An explicit destination is [page_ref, /FitType, ...].
+            return page_numbers.get(getattr(dest[0], "objid", None))
+
+        entries: List[Dict[str, Any]] = []
+        try:
+            for level, title, dest, action, _se in outline:
+                try:
+                    page_number = page_number_for(dest, action)
+                except (PSException, KeyError):
+                    # Unresolvable destination: keep the entry anyway.
+                    page_number = None
+                entries.append(
+                    {"title": title, "page_number": page_number, "level": level}
+                )
+        except PSException as e:
+            # Malformed outline: return whatever was read so far.
+            logger.debug("Unable to extract outlines: %s", e)
+        return entries
diff --git a/tests/pdfs/toc-sample.pdf b/tests/pdfs/toc-sample.pdf
new file mode 100644
index 00000000..17e2566e
Binary files /dev/null and b/tests/pdfs/toc-sample.pdf differ
diff --git a/tests/test_table_of_contents.py b/tests/test_table_of_contents.py
new file mode 100644
index 00000000..0c538178
--- /dev/null
+++ b/tests/test_table_of_contents.py
@@ -0,0 +1,31 @@
+"""
+Tests for the PDF.table_of_contents and Page.table_of_contents properties.
+"""
+
+import os
+
+import pdfplumber
+
+HERE = os.path.abspath(os.path.dirname(__file__))
+
+
+def test_table_of_contents_property():
+    # Resolve the fixture relative to this file so the test does not depend
+    # on the directory pytest is launched from.
+    path = os.path.join(HERE, "pdfs", "toc-sample.pdf")
+
+    with pdfplumber.open(path) as pdf:
+        toc = pdf.table_of_contents
+        assert isinstance(toc, list)
+
+        # NOTE(review): the loop is vacuous if the fixture has no outline;
+        # add `assert toc` once the fixture's bookmarks are confirmed.
+        for entry in toc:
+            assert set(entry) == {"title", "level", "page_number"}
+            assert isinstance(entry["title"], str)
+            assert isinstance(entry["level"], int)
+            page_number = entry["page_number"]
+            assert page_number is None or 1 <= page_number <= len(pdf.pages)
+
+        # The page-level property is a pass-through to the document's.
+        assert pdf.pages[0].table_of_contents == toc