1515 clean_pdf_text_to_html ,
1616)
1717
18- # Remove direct PyPDF2 imports from the top
19- # import PyPDF2
20- # from PyPDF2 import PdfReader
18+ # Remove direct pypdf imports from the top
19+ # import pypdf
20+ # from pypdf import PdfReader
2121
2222logger = logging .getLogger (__name__ )
2323
@@ -59,9 +59,9 @@ def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images
5959 save_images_locally : bool = False , image_save_dir : Optional [Path ] = None , batch_size : int = 4 ):
6060 # Import check at initialization time
6161 try :
62- import PyPDF2
62+ import pypdf
6363 except ImportError :
64- raise ImportError ("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
64+ raise ImportError ("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
6565
6666 self .image_dpi = image_dpi
6767 self .image_quality = image_quality
@@ -75,9 +75,9 @@ def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images
7575 def process (self , pdf_path : Path ) -> PDFProcessResult :
7676 # Import inside method to allow dependency to be optional
7777 try :
78- from PyPDF2 import PdfReader
78+ from pypdf import PdfReader
7979 except ImportError :
80- raise ImportError ("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
80+ raise ImportError ("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
8181
8282 start_time = time ()
8383 result = PDFProcessResult (
@@ -125,15 +125,15 @@ def process_batch(self, pdf_path: Path) -> PDFProcessResult:
125125 """Like process() but processes PDF pages in parallel batches"""
126126 # Import inside method to allow dependency to be optional
127127 try :
128- from PyPDF2 import PdfReader
129- import PyPDF2 # For type checking
128+ from pypdf import PdfReader
129+ import pypdf # For type checking
130130 except ImportError :
131- raise ImportError ("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
131+ raise ImportError ("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
132132
133133 import concurrent .futures
134134 import threading
135135
136- # Initialize PyPDF2 thread support
136+ # Initialize pypdf thread support
137137 if not hasattr (threading .current_thread (), "_children" ):
138138 threading .current_thread ()._children = set ()
139139
@@ -232,11 +232,11 @@ def visitor_text(text, cm, tm, font_dict, font_size):
232232 return pdf_page
233233
234234 def _extract_images (self , page , image_dir : Optional [Path ]) -> List [Dict ]:
235- # Import PyPDF2 for type checking only when needed
235+ # Import pypdf for type checking only when needed
236236 try :
237- import PyPDF2
237+ from pypdf . generic import IndirectObject
238238 except ImportError :
239- raise ImportError ("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
239+ raise ImportError ("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
240240
241241 if not self .extract_images :
242242 return []
@@ -266,7 +266,7 @@ def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
266266 width = xobj .get ('/Width' , 0 )
267267 height = xobj .get ('/Height' , 0 )
268268 color_space = xobj .get ('/ColorSpace' , '/DeviceRGB' )
269- if isinstance (color_space , PyPDF2 . generic . IndirectObject ):
269+ if isinstance (color_space , IndirectObject ):
270270 color_space = color_space .get_object ()
271271
272272 # Handle different image encodings
@@ -277,7 +277,7 @@ def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
277277 if '/FlateDecode' in filters :
278278 try :
279279 decode_parms = xobj .get ('/DecodeParms' , {})
280- if isinstance (decode_parms , PyPDF2 . generic . IndirectObject ):
280+ if isinstance (decode_parms , IndirectObject ):
281281 decode_parms = decode_parms .get_object ()
282282
283283 predictor = decode_parms .get ('/Predictor' , 1 )
@@ -416,10 +416,10 @@ def _extract_metadata(self, pdf_path: Path, reader = None) -> PDFMetadata:
416416 # Import inside method to allow dependency to be optional
417417 if reader is None :
418418 try :
419- from PyPDF2 import PdfReader
419+ from pypdf import PdfReader
420420 reader = PdfReader (pdf_path )
421421 except ImportError :
422- raise ImportError ("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
422+ raise ImportError ("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
423423
424424 meta = reader .metadata or {}
425425 created = self ._parse_pdf_date (meta .get ('/CreationDate' , '' ))
@@ -459,11 +459,11 @@ def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
459459 from pathlib import Path
460460
461461 try :
462- # Import PyPDF2 only when running the file directly
463- import PyPDF2
464- from PyPDF2 import PdfReader
462+ # Import pypdf only when running the file directly
463+ import pypdf
464+ from pypdf import PdfReader
465465 except ImportError :
466- print ("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
466+ print ("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'" )
467467 exit (1 )
468468
469469 current_dir = Path (__file__ ).resolve ().parent
0 commit comments