From 6d3538043b8b91753266cd0f29916a834e327ceb Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu Date: Mon, 6 Apr 2026 15:21:55 +0200 Subject: [PATCH 01/13] fixed bug where MarkdownHeaderSplitter's split result missed the first direct parent's header in the metadata and added lark to pyproject.toml --- .../preprocessors/markdown_header_splitter.py | 11 +++-------- pyproject.toml | 1 + 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 21311504e6..890fc1f279 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -147,7 +147,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: # process headers and build chunks chunks: list[dict] = [] header_stack: list[str | None] = [None] * 6 - active_parents: list[str] = [] # track active parent headers pending_headers: list[str] = [] # store empty headers to prepend to next content has_content = False # flag to track if any header has content @@ -169,16 +168,15 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: # skip splits w/o content if not content.strip(): # this strip is needed to avoid counting whitespace as content - # add as parent for subsequent headers - active_parents = [h for h in header_stack[: level - 1] if h is not None] - active_parents.append(header_text) if self.keep_headers: header_line = f"{header_prefix} {header_text}" pending_headers.append(header_line) continue has_content = True # at least one header has content - parent_headers = list(active_parents) + # Build parent metadata from the current header stack so the first child of a + # contentful section still inherits its full ancestor chain. 
+ parent_headers = [h for h in header_stack[: level - 1] if h is not None] logger.debug( "Creating chunk for header '{header_text}' at level {level}", header_text=header_text, level=level @@ -198,9 +196,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: else: chunks.append({"content": content, "meta": {"header": header_text, "parent_headers": parent_headers}}) - # reset active parents - active_parents = [h for h in header_stack[: level - 1] if h is not None] - # return doc unchunked if no headers have content if not has_content: logger.info( diff --git a/pyproject.toml b/pyproject.toml index 2f03c5ad6f..20f3946e08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ dependencies = [ "docstring-parser", # ComponentTool "filetype", # MIME type guessing for ImageContent "haystack-experimental", + "lark", ] [tool.hatch.envs.default] From ced2666b4b58bf9d4a3de286fe8baf628f501202 Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu Date: Mon, 6 Apr 2026 16:02:37 +0200 Subject: [PATCH 02/13] added release note --- ...rkdownHeaderSplitter-b5db96e19011b6b9.yaml | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml diff --git a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml new file mode 100644 index 0000000000..bca4965979 --- /dev/null +++ b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml @@ -0,0 +1,69 @@ +--- +features: + - | + Added **lark** to pyproject.toml. +issues: + - | + When using the **MarkdownHeaderSplitter**, in the split chunks, the child header always loses + its direct parent header in the metadata. One can execute the code below: + + .. 
code:: python + from haystack.components.preprocessors import MarkdownHeaderSplitter + from haystack import Document + + text = """ + # header 1 + intro text + + ## header 1.1 + text 1 + + ## header 1.2 + text 2 + + ### header 1.2.1 + text 3 + + ### header 1.2.2 + text 4 + """ + + document = Document(content=text) + + splitter = MarkdownHeaderSplitter( + keep_headers=True, + secondary_split="word" + ) + result = splitter.run(documents=[document])["documents"] + + for doc in result: + print(f"Header: {doc.meta['header']}, parent headers: {doc.meta['parent_headers']}") + + This output is expected: + + .. code:: text + + Header: header 1, parent headers: [] + Header: header 1.1, parent headers: ['header 1'] + Header: header 1.2, parent headers: ['header 1'] + Header: header 1.2.1, parent headers: ['header 1', 'header 1.2'] + Header: header 1.2.2, parent headers: ['header 1', 'header 1.2'] + + But the actual output is: + + .. code:: text + Header: header 1, parent headers: [] + Header: header 1.1, parent headers: [] + Header: header 1.2, parent headers: ['header 1'] + Header: header 1.2.1, parent headers: ['header 1'] + Header: header 1.2.2, parent headers: ['header 1', 'header 1.2'] + +fixes: + - | + The error happens when a parent header has its own content chunk before the first + child. In that path the code clears active_parents, so the first child loses its + ancestor while later siblings inherit it correctly. + + The fix is to derive parent_headers from the current header_stack instead of + carrying a separate mutable “active parents” list that gets out of sync after contentful + headers. 
From d78d8371203abb30bb17702feb2759735412da2f Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu Date: Thu, 9 Apr 2026 13:53:03 +0200 Subject: [PATCH 03/13] reverted changes of adding "lark" --- pyproject.toml | 1 - ...t-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml | 3 --- 2 files changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 20f3946e08..2f03c5ad6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,6 @@ dependencies = [ "docstring-parser", # ComponentTool "filetype", # MIME type guessing for ImageContent "haystack-experimental", - "lark", ] [tool.hatch.envs.default] diff --git a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml index bca4965979..328a3614b6 100644 --- a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml +++ b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml @@ -1,7 +1,4 @@ --- -features: - - | - Added **lark** to pyproject.toml. 
issues: - | When using the **MarkdownHeaderSplitter**, in the split chunks, the child header always loses From 9f4306943c642bebe645c778b955659881a63993 Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu Date: Thu, 9 Apr 2026 13:54:54 +0200 Subject: [PATCH 04/13] collapsed release note to contain only "fixes" --- ...-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml index 328a3614b6..ddccf3e22d 100644 --- a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml +++ b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml @@ -1,5 +1,5 @@ --- -issues: +fixes: - | When using the **MarkdownHeaderSplitter**, in the split chunks, the child header always loses its direct parent header in the metadata. One can execute the code below: @@ -55,8 +55,6 @@ issues: Header: header 1.2.1, parent headers: ['header 1'] Header: header 1.2.2, parent headers: ['header 1', 'header 1.2'] -fixes: - - | The error happens when a parent header has its own content chunk before the first child. In that path the code clears active_parents, so the first child loses its ancestor while later siblings inherit it correctly. From dae77ef92b8221e5d069ec9abe29e02e31a4b6ae Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu Date: Thu, 9 Apr 2026 14:03:03 +0200 Subject: [PATCH 05/13] added test "test_keep_headers_with_secondary_split_preserves_parent_headers_for_first_child" to prove my concept. 
--- .../test_markdown_header_splitter.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index a8bfbaf976..58209aaa74 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -91,6 +91,35 @@ def test_basic_split(sample_text): assert reconstructed_doc == sample_text +def test_keep_headers_with_secondary_split_preserves_parent_headers_for_first_child(): + text = ( + "# Header 1\n" + "Intro text\n" + "\n" + "## Header 1.1\n" + "Text 1\n" + "\n" + "## Header 1.2\n" + "Text 2\n" + "\n" + "### Header 1.2.1\n" + "Text 3\n" + "\n" + "### Header 1.2.2\n" + "Text 4\n" + ) + splitter = MarkdownHeaderSplitter(keep_headers=True, secondary_split="word") + split_docs = splitter.run(documents=[Document(content=text)])["documents"] + + assert [(doc.meta["header"], doc.meta["parent_headers"]) for doc in split_docs] == [ + ("Header 1", []), + ("Header 1.1", ["Header 1"]), + ("Header 1.2", ["Header 1"]), + ("Header 1.2.1", ["Header 1", "Header 1.2"]), + ("Header 1.2.2", ["Header 1", "Header 1.2"]), + ] + + def test_split_without_headers(sample_text): splitter = MarkdownHeaderSplitter(keep_headers=False) docs = [Document(content=sample_text)] From be4dd40e30af7e114fb51de4e3d110178b3bfbdf Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu <150295143+MechaCritter@users.noreply.github.com> Date: Thu, 9 Apr 2026 14:32:40 +0200 Subject: [PATCH 06/13] Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml written description in past tense Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- ...-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml index ddccf3e22d..ad3205568e 100644 --- a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml +++ b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml @@ -1,8 +1,8 @@ --- fixes: - | - When using the **MarkdownHeaderSplitter**, in the split chunks, the child header always loses - its direct parent header in the metadata. One can execute the code below: + When using the **MarkdownHeaderSplitter**, in the split chunks, the first child header under a parent header that had its own content previously lost + its direct parent header in the metadata. If one executed the code below: .. code:: python from haystack.components.preprocessors import MarkdownHeaderSplitter From b50296a916fcac7b9afcefdd80c0aa6aa76835f8 Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu <150295143+MechaCritter@users.noreply.github.com> Date: Thu, 9 Apr 2026 14:32:52 +0200 Subject: [PATCH 07/13] Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml written description in past tense Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- ...nt-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml index ad3205568e..4577c4d5c7 100644 --- a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml +++ b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml @@ -36,7 +36,7 @@ fixes: for doc in result: print(f"Header: {doc.meta['header']}, parent headers: 
{doc.meta['parent_headers']}") - This output is expected: + We would have expected this output: .. code:: text From 450b69f483898659586024e847cabf2d7654fd0c Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu <150295143+MechaCritter@users.noreply.github.com> Date: Thu, 9 Apr 2026 14:33:02 +0200 Subject: [PATCH 08/13] Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml written description in past tense Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- ...nt-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml index 4577c4d5c7..dd12591ae2 100644 --- a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml +++ b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml @@ -46,7 +46,7 @@ fixes: Header: header 1.2.1, parent headers: ['header 1', 'header 1.2'] Header: header 1.2.2, parent headers: ['header 1', 'header 1.2'] - But the actual output is: + But instead we actually got: .. 
code:: text Header: header 1, parent headers: [] From a2c7820cfe4a0e35a63d7a3eb1fa0a8a53583c06 Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu <150295143+MechaCritter@users.noreply.github.com> Date: Thu, 9 Apr 2026 15:49:22 +0200 Subject: [PATCH 09/13] Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- ...er-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml index dd12591ae2..c2787ed5a5 100644 --- a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml +++ b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml @@ -55,10 +55,7 @@ fixes: Header: header 1.2.1, parent headers: ['header 1'] Header: header 1.2.2, parent headers: ['header 1', 'header 1.2'] - The error happens when a parent header has its own content chunk before the first - child. In that path the code clears active_parents, so the first child loses its - ancestor while later siblings inherit it correctly. + The error happened when a parent header had its own content chunk before the first + child header. - The fix is to derive parent_headers from the current header_stack instead of - carrying a separate mutable “active parents” list that gets out of sync after contentful - headers. + This has been fixed so that, even when a parent header has its own content chunk before the first child header, that child's metadata still lists all of its parent headers and all content is preserved. 
From 0fe348179f51015d20ea27b68f91c9b729759897 Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu <150295143+MechaCritter@users.noreply.github.com> Date: Thu, 9 Apr 2026 15:50:06 +0200 Subject: [PATCH 10/13] Update test/components/preprocessors/test_markdown_header_splitter.py removed the "secondary_split="word"" argument as error happens regardless of if secondary split is present Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- test/components/preprocessors/test_markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 58209aaa74..4bd4ec625c 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -108,7 +108,7 @@ def test_keep_headers_with_secondary_split_preserves_parent_headers_for_first_ch "### Header 1.2.2\n" "Text 4\n" ) - splitter = MarkdownHeaderSplitter(keep_headers=True, secondary_split="word") + splitter = MarkdownHeaderSplitter(keep_headers=True) split_docs = splitter.run(documents=[Document(content=text)])["documents"] assert [(doc.meta["header"], doc.meta["parent_headers"]) for doc in split_docs] == [ From a6eb3f0e843b6decd498ba11427157e701019643 Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu <150295143+MechaCritter@users.noreply.github.com> Date: Thu, 9 Apr 2026 15:50:37 +0200 Subject: [PATCH 11/13] Update test/components/preprocessors/test_markdown_header_splitter.py broke down text in unit test for readability Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../preprocessors/test_markdown_header_splitter.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 4bd4ec625c..5dbe80a32b 
100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -94,17 +94,13 @@ def test_basic_split(sample_text): def test_keep_headers_with_secondary_split_preserves_parent_headers_for_first_child(): text = ( "# Header 1\n" - "Intro text\n" - "\n" + "Intro text\n\n" "## Header 1.1\n" - "Text 1\n" - "\n" + "Text 1\n\n" "## Header 1.2\n" - "Text 2\n" - "\n" + "Text 2\n\n" "### Header 1.2.1\n" - "Text 3\n" - "\n" + "Text 3\n\n" "### Header 1.2.2\n" "Text 4\n" ) From 58fcad19ae2f3dfc37c6409ee4aee254da1d6073 Mon Sep 17 00:00:00 2001 From: Nhat Huy Vu <150295143+MechaCritter@users.noreply.github.com> Date: Thu, 9 Apr 2026 15:51:14 +0200 Subject: [PATCH 12/13] Update test/components/preprocessors/test_markdown_header_splitter.py added sanity checking (reconstructed text is equal original text) Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- test/components/preprocessors/test_markdown_header_splitter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 5dbe80a32b..38b447dab2 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -114,6 +114,9 @@ def test_keep_headers_with_secondary_split_preserves_parent_headers_for_first_ch ("Header 1.2.1", ["Header 1", "Header 1.2"]), ("Header 1.2.2", ["Header 1", "Header 1.2"]), ] + # reconstruct original text + reconstructed_text = "".join(doc.content for doc in split_docs) + assert reconstructed_text == text def test_split_without_headers(sample_text): From c364f5cc026518dabfad2e141fc4e9feac163b5c Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> Date: Thu, 9 Apr 2026 15:55:30 +0200 Subject: [PATCH 13/13] Update 
test/components/preprocessors/test_markdown_header_splitter.py --- test/components/preprocessors/test_markdown_header_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index 38b447dab2..cdbb4989e3 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -91,7 +91,7 @@ def test_basic_split(sample_text): assert reconstructed_doc == sample_text -def test_keep_headers_with_secondary_split_preserves_parent_headers_for_first_child(): +def test_keep_headers_preserves_parent_headers_for_first_child(): text = ( "# Header 1\n" "Intro text\n\n"