Skip to content

Commit 7ba8538

Browse files
fix: improve markdown file detection for unstructured parser
Co-Authored-By: Aaron <AJ> Steers <aj@airbyte.io>
1 parent 08a365b commit 7ba8538

1 file changed

Lines changed: 15 additions & 1 deletion

File tree

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -432,7 +432,21 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
432 432
return detected_type
433 433

434 434
file.seek(0)
435-
type_based_on_content = FileType.UNK # Default to unknown
435+
try:
436+
file_content = file.read()
437+
file.seek(0)
438+
if file_content and isinstance(file_content, bytes):
439+
content_str = file_content.decode('utf-8', errors='ignore')
440+
if content_str.lstrip().startswith('#'):
441+
type_based_on_content = FileType.MD
442+
elif remote_file.mime_type == "text/markdown":
443+
type_based_on_content = FileType.MD
444+
else:
445+
type_based_on_content = FileType.UNK
446+
else:
447+
type_based_on_content = FileType.UNK
448+
except Exception:
449+
type_based_on_content = FileType.UNK
436 450
file.seek(0) # Reset file position after reading
437 451

438 452
if type_based_on_content and type_based_on_content != FileType.UNK:

0 commit comments

Comments
 (0)