Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
6d35380
fixed bug where MarkdownHeaderSplitter's split result missed the firs…
MechaCritter Apr 6, 2026
ced2666
added release note
MechaCritter Apr 6, 2026
0fdea30
Merge branch 'main' into bugfix/markdown_header_splitter
MechaCritter Apr 7, 2026
d78d837
reverted changes of adding "lark"
MechaCritter Apr 9, 2026
9f43069
collapsed release note to contain only "fixes"
MechaCritter Apr 9, 2026
dae77ef
added test "test_keep_headers_with_secondary_split_preserves_parent_h…
MechaCritter Apr 9, 2026
8f362ae
Merge remote-tracking branch 'refs/remotes/origin/bugfix/markdown_hea…
MechaCritter Apr 9, 2026
be4dd40
Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHea…
MechaCritter Apr 9, 2026
b50296a
Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHea…
MechaCritter Apr 9, 2026
450b69f
Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHea…
MechaCritter Apr 9, 2026
a2c7820
Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHea…
MechaCritter Apr 9, 2026
0fe3481
Update test/components/preprocessors/test_markdown_header_splitter.py
MechaCritter Apr 9, 2026
a6eb3f0
Update test/components/preprocessors/test_markdown_header_splitter.py
MechaCritter Apr 9, 2026
58fcad1
Update test/components/preprocessors/test_markdown_header_splitter.py
MechaCritter Apr 9, 2026
c364f5c
Update test/components/preprocessors/test_markdown_header_splitter.py
sjrl Apr 9, 2026
6de41d9
Merge branch 'main' into bugfix/markdown_header_splitter
MechaCritter Apr 9, 2026
5c85f3d
Merge branch 'main' into bugfix/markdown_header_splitter
sjrl Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 3 additions & 8 deletions haystack/components/preprocessors/markdown_header_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]:
# process headers and build chunks
chunks: list[dict] = []
header_stack: list[str | None] = [None] * 6
active_parents: list[str] = [] # track active parent headers
pending_headers: list[str] = [] # store empty headers to prepend to next content
has_content = False # flag to track if any header has content

Expand All @@ -169,16 +168,15 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]:

# skip splits w/o content
if not content.strip(): # this strip is needed to avoid counting whitespace as content
# add as parent for subsequent headers
active_parents = [h for h in header_stack[: level - 1] if h is not None]
active_parents.append(header_text)
if self.keep_headers:
header_line = f"{header_prefix} {header_text}"
pending_headers.append(header_line)
continue

has_content = True # at least one header has content
parent_headers = list(active_parents)
# Build parent metadata from the current header stack so the first child of a
# contentful section still inherits its full ancestor chain.
parent_headers = [h for h in header_stack[: level - 1] if h is not None]

logger.debug(
"Creating chunk for header '{header_text}' at level {level}", header_text=header_text, level=level
Expand All @@ -198,9 +196,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]:
else:
chunks.append({"content": content, "meta": {"header": header_text, "parent_headers": parent_headers}})

# reset active parents
active_parents = [h for h in header_stack[: level - 1] if h is not None]

# return doc unchunked if no headers have content
if not has_content:
logger.info(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
---
fixes:
- |
When using the **MarkdownHeaderSplitter**, the first child header under a parent header that has its
own content lost its direct parent header in the ``parent_headers`` metadata of the split chunks. The following code reproduces the issue:
Comment thread
MechaCritter marked this conversation as resolved.
Outdated

.. code:: python

from haystack.components.preprocessors import MarkdownHeaderSplitter
from haystack import Document

text = """
# header 1
intro text

## header 1.1
text 1

## header 1.2
text 2

### header 1.2.1
text 3

### header 1.2.2
text 4
"""

document = Document(content=text)

splitter = MarkdownHeaderSplitter(
keep_headers=True,
secondary_split="word"
)
result = splitter.run(documents=[document])["documents"]

for doc in result:
print(f"Header: {doc.meta['header']}, parent headers: {doc.meta['parent_headers']}")

The expected output is:
Comment thread
MechaCritter marked this conversation as resolved.
Outdated

.. code:: text

Header: header 1, parent headers: []
Header: header 1.1, parent headers: ['header 1']
Header: header 1.2, parent headers: ['header 1']
Header: header 1.2.1, parent headers: ['header 1', 'header 1.2']
Header: header 1.2.2, parent headers: ['header 1', 'header 1.2']

Before this fix, the actual output was:
Comment thread
MechaCritter marked this conversation as resolved.
Outdated

.. code:: text

Header: header 1, parent headers: []
Header: header 1.1, parent headers: []
Header: header 1.2, parent headers: ['header 1']
Header: header 1.2.1, parent headers: ['header 1']
Header: header 1.2.2, parent headers: ['header 1', 'header 1.2']

The error happened when a parent header had its own content chunk before the first
child. In that path the code reset ``active_parents`` to the ancestors of that parent header,
leaving out the header itself, so the first child lost its direct parent while later siblings inherited it correctly.

The fix is to derive ``parent_headers`` from the current ``header_stack`` instead of
carrying a separate mutable "active parents" list that gets out of sync after contentful
headers.
Comment thread
MechaCritter marked this conversation as resolved.
Outdated
29 changes: 29 additions & 0 deletions test/components/preprocessors/test_markdown_header_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,35 @@ def test_basic_split(sample_text):
assert reconstructed_doc == sample_text


def test_keep_headers_with_secondary_split_preserves_parent_headers_for_first_child():
Comment thread
sjrl marked this conversation as resolved.
Outdated
text = (
"# Header 1\n"
"Intro text\n"
"\n"
"## Header 1.1\n"
"Text 1\n"
"\n"
"## Header 1.2\n"
"Text 2\n"
"\n"
"### Header 1.2.1\n"
"Text 3\n"
"\n"
"### Header 1.2.2\n"
"Text 4\n"
)
Comment thread
MechaCritter marked this conversation as resolved.
splitter = MarkdownHeaderSplitter(keep_headers=True, secondary_split="word")
Comment thread
MechaCritter marked this conversation as resolved.
Outdated
split_docs = splitter.run(documents=[Document(content=text)])["documents"]

assert [(doc.meta["header"], doc.meta["parent_headers"]) for doc in split_docs] == [
("Header 1", []),
("Header 1.1", ["Header 1"]),
("Header 1.2", ["Header 1"]),
("Header 1.2.1", ["Header 1", "Header 1.2"]),
("Header 1.2.2", ["Header 1", "Header 1.2"]),
]
Comment thread
MechaCritter marked this conversation as resolved.


def test_split_without_headers(sample_text):
splitter = MarkdownHeaderSplitter(keep_headers=False)
docs = [Document(content=sample_text)]
Expand Down