Skip to content

Commit 813e5a3

Browse files
fix: prioritize extension-based detection over content sniffing in _get_filetype
Move the extension check ahead of content-based detection so that `_get_filetype` behaves deterministically across Python versions. Content-based detection in unstructured 0.18.18 may return TXT for plain-text content, which could mask the actual file type indicated by the file extension.

Co-Authored-By: unknown <>
1 parent 5594c29 commit 813e5a3

1 file changed

Lines changed: 6 additions & 6 deletions

File tree

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -424,18 +424,18 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
424424
if file_type and file_type != FileType.UNK:
425425
return file_type
426426

427-
type_based_on_content = detect_filetype(file=cast(IO[bytes], file))
428-
file.seek(0) # detect_filetype is reading to read the file content, so we need to reset
429-
430-
if type_based_on_content and type_based_on_content != FileType.UNK:
431-
return type_based_on_content
432-
433427
extension = "." + remote_file.uri.split(".")[-1].lower()
434428
try:
435429
return FileType.from_extension(extension)
436430
except ValueError:
437431
pass
438432

433+
type_based_on_content = detect_filetype(file=cast(IO[bytes], file))
434+
file.seek(0) # detect_filetype is reading to read the file content, so we need to reset
435+
436+
if type_based_on_content and type_based_on_content != FileType.UNK:
437+
return type_based_on_content
438+
439439
return None
440440

441441
def _supported_file_types(self) -> List[Any]:

0 commit comments

Comments
 (0)