Skip to content

Commit 76c7faf

Browse files
fix: handle unstructured FileType.from_extension(None) + update unstructured scenarios
Co-Authored-By: unknown <>
1 parent e011d9b commit 76c7faf

2 files changed

Lines changed: 8 additions & 9 deletions

File tree

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -434,10 +434,9 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
434434
return file_type
435435

436436
extension = "." + remote_file.uri.split(".")[-1].lower()
437-
try:
438-
return FileType.from_extension(extension)
439-
except ValueError:
440-
pass
437+
ext_type = FileType.from_extension(extension)
438+
if ext_type is not None:
439+
return ext_type
441440

442441
type_based_on_content = detect_filetype(file=cast(IO[bytes], file))
443442
file.seek(0) # detect_filetype reads the file content, so we need to reset the stream position afterwards

unit_tests/sources/file_based/scenarios/unstructured_scenarios.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -434,7 +434,7 @@
434434
{
435435
"data": {
436436
"document_key": "sample.pdf",
437-
"content": "# Hello World",
437+
"_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference",
438438
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
439439
"_ab_source_file_url": "sample.pdf",
440440
},
@@ -443,7 +443,7 @@
443443
{
444444
"data": {
445445
"document_key": "sample.docx",
446-
"content": "# Content",
446+
"content": "Content",
447447
"_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
448448
"_ab_source_file_url": "sample.docx",
449449
},
@@ -510,7 +510,7 @@
510510
{
511511
"data": {
512512
"document_key": "sample.pdf",
513-
"_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=No /Root object! - Is this really a PDF?",
513+
"_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference",
514514
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
515515
"_ab_source_file_url": "sample.pdf",
516516
},
@@ -578,7 +578,7 @@
578578
{
579579
"data": {
580580
"document_key": "pdf_without_extension",
581-
"content": "# Hello World",
581+
"_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=pdf_without_extension message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference",
582582
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
583583
"_ab_source_file_url": "pdf_without_extension",
584584
},
@@ -587,7 +587,7 @@
587587
{
588588
"data": {
589589
"document_key": "docx_without_extension",
590-
"content": "# Content",
590+
"content": "Content",
591591
"_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z",
592592
"_ab_source_file_url": "docx_without_extension",
593593
},

0 commit comments

Comments
 (0)