Skip to content

Commit 0651862

Browse files
fix: handle partition_pdf import failure gracefully in unstructured 0.18.18
partition_pdf now requires the unstructured_inference package, which may not be installed. Make the import optional and check availability only when actually processing PDF files. MD/TXT files don't need the partition functions and should not fail due to missing PDF dependencies. Co-Authored-By: unknown <>
1 parent 813e5a3 commit 0651862

1 file changed

Lines changed: 18 additions & 18 deletions

File tree

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -79,14 +79,18 @@ def _import_unstructured() -> None:
7979
global unstructured_partition_docx
8080
global unstructured_partition_pptx
8181
from unstructured.partition.docx import partition_docx
82-
from unstructured.partition.pdf import partition_pdf
8382
from unstructured.partition.pptx import partition_pptx
8483

85-
# separate global variables to properly propagate typing
86-
unstructured_partition_pdf = partition_pdf
8784
unstructured_partition_docx = partition_docx
8885
unstructured_partition_pptx = partition_pptx
8986

87+
try:
88+
from unstructured.partition.pdf import partition_pdf
89+
90+
unstructured_partition_pdf = partition_pdf
91+
except (ImportError, ModuleNotFoundError):
92+
pass
93+
9094

9195
def user_error(e: Exception) -> bool:
9296
"""
@@ -201,13 +205,6 @@ def _read_file(
201205
logger: logging.Logger,
202206
) -> str:
203207
_import_unstructured()
204-
if (
205-
(not unstructured_partition_pdf)
206-
or (not unstructured_partition_docx)
207-
or (not unstructured_partition_pptx)
208-
):
209-
# check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
210-
raise Exception("unstructured library is not available")
211208

212209
filetype: FileType | None = self._get_filetype(file_handle, remote_file)
213210

@@ -350,13 +347,6 @@ def _read_file_locally(
350347
self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile
351348
) -> str:
352349
_import_unstructured()
353-
if (
354-
(not unstructured_partition_pdf)
355-
or (not unstructured_partition_docx)
356-
or (not unstructured_partition_pptx)
357-
):
358-
# check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
359-
raise Exception("unstructured library is not available")
360350

361351
file: Any = file_handle
362352

@@ -367,15 +357,25 @@ def _read_file_locally(
367357

368358
try:
369359
if filetype == FileType.PDF:
370-
# for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects
360+
if not unstructured_partition_pdf:
361+
raise self._create_parse_error(
362+
remote_file,
363+
"PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference",
364+
)
371365
file_handle.seek(0)
372366
with BytesIO(file_handle.read()) as file:
373367
file_handle.seek(0)
374368
elements = unstructured_partition_pdf(file=file, strategy=strategy)
375369
elif filetype == FileType.DOCX:
370+
if not unstructured_partition_docx:
371+
raise self._create_parse_error(remote_file, "DOCX partition function is not available")
376372
elements = unstructured_partition_docx(file=file)
377373
elif filetype == FileType.PPTX:
374+
if not unstructured_partition_pptx:
375+
raise self._create_parse_error(remote_file, "PPTX partition function is not available")
378376
elements = unstructured_partition_pptx(file=file)
377+
except RecordParseError:
378+
raise
379379
except Exception as e:
380380
raise self._create_parse_error(remote_file, str(e))
381381

0 commit comments

Comments
 (0)