@@ -95,6 +95,8 @@ def load_pdf(
9595 vertical_ttb : bool = True ,
9696 extra_attrs : Optional [List [str ]] = None ,
9797 dpi : int = DEFAULT_PDF_DPI ,
98+ first_page : Optional [int ] = None ,
99+ last_page : Optional [int ] = None ,
98100) -> Union [List [Layout ], Tuple [List [Layout ], List ["Image.Image" ]]]:
99101 """Load all tokens for each page from a PDF file, and save them
100102 in a list of Layout objects with the original page order.
@@ -163,6 +165,10 @@ def load_pdf(
163165 pdf_layouts, it can be rendered appropriately.
164166 Defaults to `DEFAULT_PDF_DPI=72`, which is also the default rendering dpi
165167 from the pdfplumber PDF parser.
168+ first_page (int, optional):
169+ First page to process (1-indexed). Defaults to None (start from first page).
170+ last_page (int, optional):
171+ Last page to process (1-indexed, inclusive). Defaults to None (process to last page).
166172
167173 Returns:
168174 List[Layout]:
@@ -188,11 +194,14 @@ def load_pdf(
188194
189195 plumber_pdf_object = pdfplumber .open (filename )
190196
197+ start = (first_page - 1 ) if first_page is not None else None
198+ end = last_page if last_page is not None else None
199+ pages = plumber_pdf_object .pages [start :end ]
200+
191201 all_page_layout = []
192202
193203 with plumber_pdf_object :
194- for page_id in range (len (plumber_pdf_object .pages )):
195- cur_page = plumber_pdf_object .pages [page_id ]
204+ for page_id , cur_page in enumerate (pages , start = (start or 0 )):
196205
197206 page_tokens = extract_words_for_page (
198207 cur_page ,
@@ -218,11 +227,17 @@ def load_pdf(
218227 import pdf2image
219228
220229 if is_path :
221- pdf_images = pdf2image .convert_from_path (filename , dpi = dpi )
230+ pdf_images = pdf2image .convert_from_path (
231+ filename , dpi = dpi ,
232+ first_page = first_page , last_page = last_page ,
233+ )
222234 else :
223235 if isinstance (filename , BytesIO ):
224236 filename .seek (0 )
225- pdf_images = pdf2image .convert_from_bytes (filename .read () if hasattr (filename , 'read' ) else filename , dpi = dpi )
237+ pdf_images = pdf2image .convert_from_bytes (
238+ filename .read () if hasattr (filename , 'read' ) else filename ,
239+ dpi = dpi , first_page = first_page , last_page = last_page ,
240+ )
226241
227242 for page_id , page_image in enumerate (pdf_images ):
228243 image_width , image_height = page_image .size
0 commit comments