Skip to content

Commit 763050f

Browse files
fix(cdk): upgrade unstructured from 0.10.27 to 0.18.18 to fix CVE-2025-64712
Upgrades the unstructured library to address critical path traversal vulnerability GHSA-gm8q-m8mv-jj5m (CVSS 9.8) in partition_msg.

Changes:
- Update unstructured dependency from 0.10.27 to 0.18.18
- Add pi-heif dependency required by new unstructured version
- Adapt unstructured_parser.py to new API:
  - Replace removed EXT_TO_FILETYPE/STR_TO_FILETYPE/FILETYPE_TO_MIMETYPE with FileType.from_extension()/from_mime_type()/mime_type property
  - Update detect_filetype() parameter from filename= to file_path=
- Update test mocks to match new API surface

Co-Authored-By: unknown <>
1 parent e9144e2 commit 763050f

4 files changed

Lines changed: 340 additions & 104 deletions

File tree

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 12 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -12,13 +12,7 @@
1212
import dpath
1313
import nltk
1414
import requests
15-
from unstructured.file_utils.filetype import (
16-
EXT_TO_FILETYPE,
17-
FILETYPE_TO_MIMETYPE,
18-
STR_TO_FILETYPE,
19-
FileType,
20-
detect_filetype,
21-
)
15+
from unstructured.file_utils.filetype import FileType, detect_filetype
2216

2317
from airbyte_cdk.models import FailureType
2418
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
@@ -335,7 +329,7 @@ def _read_file_remotely(
335329

336330
data = self._params_to_dict(format.parameters, strategy)
337331

338-
file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
332+
file_data = {"files": ("filename", file_handle, filetype.mime_type)}
339333

340334
response = requests.post(
341335
f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
@@ -405,8 +399,11 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
405399
2. Use the file name if available
406400
3. Use the file content
407401
"""
408-
if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE:
409-
return STR_TO_FILETYPE[remote_file.mime_type]
402+
if remote_file.mime_type:
403+
try:
404+
return FileType.from_mime_type(remote_file.mime_type)
405+
except ValueError:
406+
pass
410407

411408
# set name to none, otherwise unstructured will try to get the modified date from the local file system
412409
if hasattr(file, "name"):
@@ -418,7 +415,7 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
418415
file_type: FileType | None = None
419416
try:
420417
file_type = detect_filetype(
421-
filename=remote_file.uri,
418+
file_path=remote_file.uri,
422419
)
423420
except Exception:
424421
# Path doesn't exist locally. Try something else...
@@ -434,8 +431,10 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
434431
return type_based_on_content
435432

436433
extension = "." + remote_file.uri.split(".")[-1].lower()
437-
if extension in EXT_TO_FILETYPE:
438-
return EXT_TO_FILETYPE[extension]
434+
try:
435+
return FileType.from_extension(extension)
436+
except ValueError:
437+
pass
439438

440439
return None
441440

0 commit comments

Comments
 (0)