Skip to content

Commit df4d87e

Browse files
committed
refactor: replace PyPDF2 with pypdf across the codebase. ref #1412
1 parent f32cfc6 commit df4d87e

File tree

4 files changed

+26
-26
lines changed

4 files changed

+26
-26
lines changed

crawl4ai/processors/pdf/processor.py

Lines changed: 22 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -15,9 +15,9 @@
1515
clean_pdf_text_to_html,
1616
)
1717

18-
# Remove direct PyPDF2 imports from the top
19-
# import PyPDF2
20-
# from PyPDF2 import PdfReader
18+
# pypdf is deliberately not imported at module level; it is imported inside the methods that need it so the dependency stays optional
19+
# import pypdf
20+
# from pypdf import PdfReader
2121

2222
logger = logging.getLogger(__name__)
2323

@@ -59,9 +59,9 @@ def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images
5959
save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4):
6060
# Import check at initialization time
6161
try:
62-
import PyPDF2
62+
import pypdf
6363
except ImportError:
64-
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
64+
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
6565

6666
self.image_dpi = image_dpi
6767
self.image_quality = image_quality
@@ -75,9 +75,9 @@ def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images
7575
def process(self, pdf_path: Path) -> PDFProcessResult:
7676
# Import inside method to allow dependency to be optional
7777
try:
78-
from PyPDF2 import PdfReader
78+
from pypdf import PdfReader
7979
except ImportError:
80-
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
80+
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
8181

8282
start_time = time()
8383
result = PDFProcessResult(
@@ -125,15 +125,15 @@ def process_batch(self, pdf_path: Path) -> PDFProcessResult:
125125
"""Like process() but processes PDF pages in parallel batches"""
126126
# Import inside method to allow dependency to be optional
127127
try:
128-
from PyPDF2 import PdfReader
129-
import PyPDF2 # For type checking
128+
from pypdf import PdfReader
129+
import pypdf # For type checking
130130
except ImportError:
131-
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
131+
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
132132

133133
import concurrent.futures
134134
import threading
135135

136-
# Initialize PyPDF2 thread support
136+
# Initialize pypdf thread support
137137
if not hasattr(threading.current_thread(), "_children"):
138138
threading.current_thread()._children = set()
139139

@@ -232,11 +232,11 @@ def visitor_text(text, cm, tm, font_dict, font_size):
232232
return pdf_page
233233

234234
def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
235-
# Import PyPDF2 for type checking only when needed
235+
# Import pypdf for type checking only when needed
236236
try:
237-
import PyPDF2
237+
from pypdf.generic import IndirectObject
238238
except ImportError:
239-
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
239+
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
240240

241241
if not self.extract_images:
242242
return []
@@ -266,7 +266,7 @@ def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
266266
width = xobj.get('/Width', 0)
267267
height = xobj.get('/Height', 0)
268268
color_space = xobj.get('/ColorSpace', '/DeviceRGB')
269-
if isinstance(color_space, PyPDF2.generic.IndirectObject):
269+
if isinstance(color_space, IndirectObject):
270270
color_space = color_space.get_object()
271271

272272
# Handle different image encodings
@@ -277,7 +277,7 @@ def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
277277
if '/FlateDecode' in filters:
278278
try:
279279
decode_parms = xobj.get('/DecodeParms', {})
280-
if isinstance(decode_parms, PyPDF2.generic.IndirectObject):
280+
if isinstance(decode_parms, IndirectObject):
281281
decode_parms = decode_parms.get_object()
282282

283283
predictor = decode_parms.get('/Predictor', 1)
@@ -416,10 +416,10 @@ def _extract_metadata(self, pdf_path: Path, reader = None) -> PDFMetadata:
416416
# Import inside method to allow dependency to be optional
417417
if reader is None:
418418
try:
419-
from PyPDF2 import PdfReader
419+
from pypdf import PdfReader
420420
reader = PdfReader(pdf_path)
421421
except ImportError:
422-
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
422+
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
423423

424424
meta = reader.metadata or {}
425425
created = self._parse_pdf_date(meta.get('/CreationDate', ''))
@@ -459,11 +459,11 @@ def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
459459
from pathlib import Path
460460

461461
try:
462-
# Import PyPDF2 only when running the file directly
463-
import PyPDF2
464-
from PyPDF2 import PdfReader
462+
# Import pypdf only when running the file directly
463+
import pypdf
464+
from pypdf import PdfReader
465465
except ImportError:
466-
print("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
466+
print("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
467467
exit(1)
468468

469469
current_dir = Path(__file__).resolve().parent

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -59,13 +59,13 @@ classifiers = [
5959
]
6060

6161
[project.optional-dependencies]
62-
pdf = ["PyPDF2"]
62+
pdf = ["pypdf"]
6363
torch = ["torch", "nltk", "scikit-learn"]
6464
transformer = ["transformers", "tokenizers", "sentence-transformers"]
6565
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
6666
sync = ["selenium"]
6767
all = [
68-
"PyPDF2",
68+
"pypdf",
6969
"torch",
7070
"nltk",
7171
"scikit-learn",

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,4 +33,4 @@ shapely>=2.0.0
3333

3434
fake-useragent>=2.2.0
3535
pdf2image>=1.17.0
36-
PyPDF2>=3.0.1
36+
pypdf>=6.0.0

tests/check_dependencies.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,7 @@
7171
'sentence_transformers': 'sentence-transformers',
7272
'rank_bm25': 'rank-bm25',
7373
'snowballstemmer': 'snowballstemmer',
74-
'PyPDF2': 'PyPDF2',
74+
'pypdf': 'pypdf',
7575
'pdf2image': 'pdf2image',
7676
}
7777

0 commit comments

Comments
 (0)