fixed bug where MarkdownHeaderSplitter's split result missed the first direct parent's header in the metadata and added lark to pyproject.toml (#11042)

MechaCritter · sjrl · web-flow · commit a545c6af9d05 · 2026-04-09T16:53:31.000+02:00
* fixed bug where MarkdownHeaderSplitter's split result missed the first direct parent's header in the metadata and added lark to pyproject.toml

* added release note

* reverted changes of adding "lark"

* collapsed release note to contain only "fixes"

* added test "test_keep_headers_with_secondary_split_preserves_parent_headers_for_first_child" to prove my concept.

* Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml

written description in past tense

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>

* Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml

written description in past tense

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>

* Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml

written description in past tense

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>

* Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>

* Update test/components/preprocessors/test_markdown_header_splitter.py

removed the `secondary_split="word"` argument, as the error happens regardless of whether a secondary split is present

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>

* Update test/components/preprocessors/test_markdown_header_splitter.py

broke down text in unit test for readability

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>

* Update test/components/preprocessors/test_markdown_header_splitter.py

added a sanity check (the reconstructed text equals the original text)

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>

* Update test/components/preprocessors/test_markdown_header_splitter.py

---------

Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>
diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py
@@ -147,7 +147,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]:
         # process headers and build chunks
         chunks: list[dict] = []
         header_stack: list[str | None] = [None] * 6
-        active_parents: list[str] = []  # track active parent headers
         pending_headers: list[str] = []  # store empty headers to prepend to next content
         has_content = False  # flag to track if any header has content
 
@@ -169,16 +168,15 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]:
 
             # skip splits w/o content
             if not content.strip():  # this strip is needed to avoid counting whitespace as content
-                # add as parent for subsequent headers
-                active_parents = [h for h in header_stack[: level - 1] if h is not None]
-                active_parents.append(header_text)
                 if self.keep_headers:
                     header_line = f"{header_prefix} {header_text}"
                     pending_headers.append(header_line)
                 continue
 
             has_content = True  # at least one header has content
-            parent_headers = list(active_parents)
+            # Build parent metadata from the current header stack so the first child of a
+            # contentful section still inherits its full ancestor chain.
+            parent_headers = [h for h in header_stack[: level - 1] if h is not None]
 
             logger.debug(
                 "Creating chunk for header '{header_text}' at level {level}", header_text=header_text, level=level
@@ -198,9 +196,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]:
             else:
                 chunks.append({"content": content, "meta": {"header": header_text, "parent_headers": parent_headers}})
 
-            # reset active parents
-            active_parents = [h for h in header_stack[: level - 1] if h is not None]
-
         # return doc unchunked if no headers have content
         if not has_content:
             logger.info(
diff --git a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml
@@ -0,0 +1,61 @@
+---
+fixes:
+  - |
+    When using the **MarkdownHeaderSplitter**, the first child header of a section previously lost
+    its direct parent header in the chunk metadata. For example, if one executed the code below:
+
+    .. code:: python
+
+      from haystack.components.preprocessors import MarkdownHeaderSplitter
+      from haystack import Document
+
+      text = """
+      # header 1
+      intro text
+
+      ## header 1.1
+      text 1
+
+      ## header 1.2
+      text 2
+
+      ### header 1.2.1
+      text 3
+
+      ### header 1.2.2
+      text 4
+      """
+
+      document = Document(content=text)
+
+      splitter = MarkdownHeaderSplitter(
+              keep_headers=True,
+              secondary_split="word"
+      )
+      result = splitter.run(documents=[document])["documents"]
+
+      for doc in result:
+          print(f"Header: {doc.meta['header']}, parent headers: {doc.meta['parent_headers']}")
+
+    We would have expected this output:
+
+    .. code:: text
+
+      Header: header 1, parent headers: []
+      Header: header 1.1, parent headers: ['header 1']
+      Header: header 1.2, parent headers: ['header 1']
+      Header: header 1.2.1, parent headers: ['header 1', 'header 1.2']
+      Header: header 1.2.2, parent headers: ['header 1', 'header 1.2']
+
+    But instead we actually got:
+
+    .. code:: text
+
+      Header: header 1, parent headers: []
+      Header: header 1.1, parent headers: []
+      Header: header 1.2, parent headers: ['header 1']
+      Header: header 1.2.1, parent headers: ['header 1']
+      Header: header 1.2.2, parent headers: ['header 1', 'header 1.2']
+
+    The error happened when a parent header had its own content chunk before the first
+    child header.
+
+    This has been fixed so that, even when a parent header has its own content chunk before the first child header, the child's full chain of parent headers is preserved in the metadata.
diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py
@@ -91,6 +91,34 @@ def test_basic_split(sample_text):
     assert reconstructed_doc == sample_text
 
 
+def test_keep_headers_preserves_parent_headers_for_first_child():
+    text = (
+        "# Header 1\n"
+        "Intro text\n\n"
+        "## Header 1.1\n"
+        "Text 1\n\n"
+        "## Header 1.2\n"
+        "Text 2\n\n"
+        "### Header 1.2.1\n"
+        "Text 3\n\n"
+        "### Header 1.2.2\n"
+        "Text 4\n"
+    )
+    splitter = MarkdownHeaderSplitter(keep_headers=True)
+    split_docs = splitter.run(documents=[Document(content=text)])["documents"]
+
+    assert [(doc.meta["header"], doc.meta["parent_headers"]) for doc in split_docs] == [
+        ("Header 1", []),
+        ("Header 1.1", ["Header 1"]),
+        ("Header 1.2", ["Header 1"]),
+        ("Header 1.2.1", ["Header 1", "Header 1.2"]),
+        ("Header 1.2.2", ["Header 1", "Header 1.2"]),
+    ]
+    # reconstruct original text
+    reconstructed_text = "".join(doc.content for doc in split_docs)
+    assert reconstructed_text == text
+
+
 def test_split_without_headers(sample_text):
     splitter = MarkdownHeaderSplitter(keep_headers=False)
     docs = [Document(content=sample_text)]