refactor: replace PyPDF2 with pypdf across the codebase. ref #1412

ntohidi · ntohidi · commit df4d87ed78a0 · 2025-12-03T10:59:18.000+01:00
diff --git a/crawl4ai/processors/pdf/processor.py b/crawl4ai/processors/pdf/processor.py
@@ -15,9 +15,9 @@
     clean_pdf_text_to_html,
 )
 
-# Remove direct PyPDF2 imports from the top
-# import PyPDF2
-# from PyPDF2 import PdfReader
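+# pypdf is an optional dependency, so it is deliberately not imported at
+# module level. Each method that needs it imports it locally and raises an
+# ImportError with install instructions when it is missing.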
 
 logger = logging.getLogger(__name__)
 
@@ -59,9 +59,9 @@ def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images
                  save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4):
         # Import check at initialization time
         try:
-            import PyPDF2
+            import pypdf
         except ImportError:
-            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+            raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
             
         self.image_dpi = image_dpi
         self.image_quality = image_quality
@@ -75,9 +75,9 @@ def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images
     def process(self, pdf_path: Path) -> PDFProcessResult:
         # Import inside method to allow dependency to be optional
         try:
-            from PyPDF2 import PdfReader
+            from pypdf import PdfReader
         except ImportError:
-            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+            raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
             
         start_time = time()
         result = PDFProcessResult(
@@ -125,15 +125,15 @@ def process_batch(self, pdf_path: Path) -> PDFProcessResult:
         """Like process() but processes PDF pages in parallel batches"""
         # Import inside method to allow dependency to be optional
         try:
-            from PyPDF2 import PdfReader
-            import PyPDF2  # For type checking
+            from pypdf import PdfReader
+            import pypdf  # For type checking
         except ImportError:
-            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+            raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
             
         import concurrent.futures
         import threading
         
-        # Initialize PyPDF2 thread support
+        # Initialize pypdf thread support
         if not hasattr(threading.current_thread(), "_children"): 
             threading.current_thread()._children = set()
         
@@ -232,11 +232,11 @@ def visitor_text(text, cm, tm, font_dict, font_size):
         return pdf_page
 
     def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
-        # Import PyPDF2 for type checking only when needed
+        # Import IndirectObject lazily; it is needed for the runtime isinstance checks below
         try:
-            import PyPDF2
+            from pypdf.generic import IndirectObject
         except ImportError:
-            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+            raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
             
         if not self.extract_images:
             return []
@@ -266,7 +266,7 @@ def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
                                     width = xobj.get('/Width', 0)
                                     height = xobj.get('/Height', 0)
                                     color_space = xobj.get('/ColorSpace', '/DeviceRGB')
-                                    if isinstance(color_space, PyPDF2.generic.IndirectObject):
+                                    if isinstance(color_space, IndirectObject):
                                         color_space = color_space.get_object()
 
                                     # Handle different image encodings
@@ -277,7 +277,7 @@ def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
                                     if '/FlateDecode' in filters:
                                         try:
                                             decode_parms = xobj.get('/DecodeParms', {})
-                                            if isinstance(decode_parms, PyPDF2.generic.IndirectObject):
+                                            if isinstance(decode_parms, IndirectObject):
                                                 decode_parms = decode_parms.get_object()
                                             
                                             predictor = decode_parms.get('/Predictor', 1)
@@ -416,10 +416,10 @@ def _extract_metadata(self, pdf_path: Path, reader = None) -> PDFMetadata:
         # Import inside method to allow dependency to be optional 
         if reader is None:
             try:
-                from PyPDF2 import PdfReader
+                from pypdf import PdfReader
                 reader = PdfReader(pdf_path)
             except ImportError:
-                raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+                raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
 
         meta = reader.metadata or {}
         created = self._parse_pdf_date(meta.get('/CreationDate', ''))
@@ -459,11 +459,11 @@ def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
     from pathlib import Path
     
     try:
-        # Import PyPDF2 only when running the file directly
-        import PyPDF2
-        from PyPDF2 import PdfReader
+        # Import pypdf only when running the file directly
+        import pypdf
+        from pypdf import PdfReader
     except ImportError:
-        print("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+        print("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
         exit(1)
         
     current_dir = Path(__file__).resolve().parent
diff --git a/pyproject.toml b/pyproject.toml
@@ -59,13 +59,13 @@ classifiers = [
 ]
 
 [project.optional-dependencies]
-pdf = ["PyPDF2"]  
+pdf = ["pypdf"]  
 torch = ["torch", "nltk", "scikit-learn"]
 transformer = ["transformers", "tokenizers", "sentence-transformers"]
 cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
 sync = ["selenium"]
 all = [
-    "PyPDF2",
+    "pypdf",
     "torch",
     "nltk",
     "scikit-learn",
diff --git a/requirements.txt b/requirements.txt
@@ -33,4 +33,4 @@ shapely>=2.0.0
 
 fake-useragent>=2.2.0
 pdf2image>=1.17.0
-PyPDF2>=3.0.1
+pypdf>=6.0.0
diff --git a/tests/check_dependencies.py b/tests/check_dependencies.py
@@ -71,7 +71,7 @@
     'sentence_transformers': 'sentence-transformers',
     'rank_bm25': 'rank-bm25',
     'snowballstemmer': 'snowballstemmer',
-    'PyPDF2': 'PyPDF2',
+    'pypdf': 'pypdf',
     'pdf2image': 'pdf2image',
 }
 

Original file line number	Diff line number	Diff line change
`@@ -71,7 +71,7 @@`
`71`	`71`	`'sentence_transformers': 'sentence-transformers',`
`72`	`72`	`'rank_bm25': 'rank-bm25',`
`73`	`73`	`'snowballstemmer': 'snowballstemmer',`
`74`		`- 'PyPDF2': 'PyPDF2',`
	`74`	`+ 'pypdf': 'pypdf',`
`75`	`75`	`'pdf2image': 'pdf2image',`
`76`	`76`	`}`
`77`	`77`