Skip to content

Commit 23b2ec8

Browse files
IgorTavcar and claude
committed
Add first_page/last_page parameters to load_pdf
Allows processing a subset of pages by specifying a 1-indexed, inclusive page range. Slices the pages list instead of iterating over all pages and skipping, and passes the same range to pdf2image for image loading. Based on: Layout-Parser#190 (AIexanderDicke). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6c50c51 commit 23b2ec8

File tree

2 files changed

+34
-5
lines changed

2 files changed

+34
-5
lines changed

src/layoutparser/io/pdf.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ def load_pdf(
9595
vertical_ttb: bool = True,
9696
extra_attrs: Optional[List[str]] = None,
9797
dpi: int = DEFAULT_PDF_DPI,
98+
first_page: Optional[int] = None,
99+
last_page: Optional[int] = None,
98100
) -> Union[List[Layout], Tuple[List[Layout], List["Image.Image"]]]:
99101
"""Load all tokens for each page from a PDF file, and save them
100102
in a list of Layout objects with the original page order.
@@ -163,6 +165,10 @@ def load_pdf(
163165
pdf_layouts, it can be rendered appropriately.
164166
Defaults to `DEFAULT_PDF_DPI=72`, which is also the default rendering dpi
165167
from the pdfplumber PDF parser.
168+
first_page (int, optional):
169+
First page to process (1-indexed). Defaults to None (start from first page).
170+
last_page (int, optional):
171+
Last page to process (1-indexed, inclusive). Defaults to None (process to last page).
166172
167173
Returns:
168174
List[Layout]:
@@ -188,11 +194,14 @@ def load_pdf(
188194

189195
plumber_pdf_object = pdfplumber.open(filename)
190196

197+
start = (first_page - 1) if first_page is not None else None
198+
end = last_page if last_page is not None else None
199+
pages = plumber_pdf_object.pages[start:end]
200+
191201
all_page_layout = []
192202

193203
with plumber_pdf_object:
194-
for page_id in range(len(plumber_pdf_object.pages)):
195-
cur_page = plumber_pdf_object.pages[page_id]
204+
for page_id, cur_page in enumerate(pages, start=(start or 0)):
196205

197206
page_tokens = extract_words_for_page(
198207
cur_page,
@@ -218,11 +227,17 @@ def load_pdf(
218227
import pdf2image
219228

220229
if is_path:
221-
pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)
230+
pdf_images = pdf2image.convert_from_path(
231+
filename, dpi=dpi,
232+
first_page=first_page, last_page=last_page,
233+
)
222234
else:
223235
if isinstance(filename, BytesIO):
224236
filename.seek(0)
225-
pdf_images = pdf2image.convert_from_bytes(filename.read() if hasattr(filename, 'read') else filename, dpi=dpi)
237+
pdf_images = pdf2image.convert_from_bytes(
238+
filename.read() if hasattr(filename, 'read') else filename,
239+
dpi=dpi, first_page=first_page, last_page=last_page,
240+
)
226241

227242
for page_id, page_image in enumerate(pdf_images):
228243
image_width, image_height = page_image.size

tests/test_io.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,4 +128,18 @@ def test_pdf_with_file_object():
128128

129129
page_layout = pdf_layout[0]
130130
for attr_name in ["width", "height", "index"]:
131-
assert attr_name in page_layout.page_data
131+
assert attr_name in page_layout.page_data
132+
133+
134+
def test_pdf_page_range():
135+
# example.pdf has 1 page, so first_page=1, last_page=1 should return it
136+
pdf_layout = load_pdf("tests/fixtures/io/example.pdf", first_page=1, last_page=1)
137+
assert len(pdf_layout) == 1
138+
139+
page_layout = pdf_layout[0]
140+
for attr_name in ["width", "height", "index"]:
141+
assert attr_name in page_layout.page_data
142+
143+
# Requesting a page range that starts beyond the end of the document returns an empty list
144+
pdf_layout = load_pdf("tests/fixtures/io/example.pdf", first_page=2)
145+
assert len(pdf_layout) == 0

0 commit comments

Comments
 (0)