Skip to content

Commit a8f4e04

Browse files
fix: wrap MD/TXT content in JSON element when output_format is markdown_json
When output_format='markdown_json', MD and TXT files now return a JSON array with a single NarrativeText element instead of raw text, ensuring a consistent output format across all file types.

Co-Authored-By: Ryan Waskewich <ryan.waskewich@airbyte.io>
1 parent 1afcc98 commit a8f4e04

2 files changed

Lines changed: 41 additions & 0 deletions

File tree

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,10 @@ def _read_file(
233233
if filetype in {FileType.MD, FileType.TXT}:
234234
file_content: bytes = file_handle.read()
235235
decoded_content: str = optional_decode(file_content)
236+
if format.output_format == "markdown_json":
237+
return json.dumps(
238+
[{"type": "NarrativeText", "text": decoded_content, "metadata": {}}]
239+
)
236240
return decoded_content
237241
if format.processing.mode == "local":
238242
elements = self._read_file_locally_elements(

unit_tests/sources/file_based/file_types/test_unstructured_parser.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -800,3 +800,40 @@ def test_parse_records_markdown_json_remote(
800800
# Verify content is valid JSON matching the API response
801801
content = json.loads(records[0]["content"])
802802
assert content == json_response
803+
804+
805+
@patch("unstructured.partition.pdf.partition_pdf")
@patch("unstructured.partition.pptx.partition_pptx")
@patch("unstructured.partition.docx.partition_docx")
@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.detect_filetype")
def test_parse_records_markdown_json_md_file(
    mock_detect_filetype,
    mock_partition_docx,
    mock_partition_pptx,
    mock_partition_pdf,
):
    """Check the markdown_json output produced for a file detected as Markdown.

    The parser is run over a fake ``.md`` file whose detected type is forced to
    ``FileType.MD``. The single record it yields must carry a ``content`` field
    that decodes to a one-item JSON list holding a ``NarrativeText`` element
    with the file's raw text and empty metadata.
    """
    # Force the detected file type to Markdown for this run.
    mock_detect_filetype.return_value = FileType.MD

    # Parser configuration requesting the JSON element output.
    parser_config = MagicMock()
    parser_config.format = UnstructuredFormat(
        skip_unprocessable_files=False,
        output_format="markdown_json",
    )

    # Stream reader whose open_file hands back the raw Markdown bytes.
    reader = MagicMock()
    mock_open(reader.open_file, read_data=b"# Hello World\n\nSome text content")

    remote_md = RemoteFile(uri="path/to/file.md", last_modified=datetime.now())
    log = MagicMock()

    parser = UnstructuredParser()
    record_stream = parser.parse_records(parser_config, remote_md, reader, log, MagicMock())
    records = list(record_stream)

    assert len(records) == 1
    only_record = records[0]
    assert only_record["document_key"] == "path/to/file.md"
    assert only_record["_ab_source_file_parse_error"] is None

    # The content field must decode to a list holding exactly one element.
    elements = json.loads(only_record["content"])
    assert isinstance(elements, list)
    assert len(elements) == 1

    element = elements[0]
    assert element["type"] == "NarrativeText"
    assert element["text"] == "# Hello World\n\nSome text content"
    assert element["metadata"] == {}

0 commit comments

Comments
 (0)