Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
6d35380
fixed bug where MarkdownHeaderSplitter's split result missed the firs…
MechaCritter Apr 6, 2026
ced2666
added release note
MechaCritter Apr 6, 2026
0fdea30
Merge branch 'main' into bugfix/markdown_header_splitter
MechaCritter Apr 7, 2026
d78d837
reverted changes of adding "lark"
MechaCritter Apr 9, 2026
9f43069
collapsed release note to contain only "fixes"
MechaCritter Apr 9, 2026
dae77ef
added test "test_keep_headers_with_secondary_split_preserves_parent_h…
MechaCritter Apr 9, 2026
8f362ae
Merge remote-tracking branch 'refs/remotes/origin/bugfix/markdown_hea…
MechaCritter Apr 9, 2026
be4dd40
Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHea…
MechaCritter Apr 9, 2026
b50296a
Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHea…
MechaCritter Apr 9, 2026
450b69f
Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHea…
MechaCritter Apr 9, 2026
a2c7820
Update releasenotes/notes/fix-missing-parent-header-error-MarkdownHea…
MechaCritter Apr 9, 2026
0fe3481
Update test/components/preprocessors/test_markdown_header_splitter.py
MechaCritter Apr 9, 2026
a6eb3f0
Update test/components/preprocessors/test_markdown_header_splitter.py
MechaCritter Apr 9, 2026
58fcad1
Update test/components/preprocessors/test_markdown_header_splitter.py
MechaCritter Apr 9, 2026
c364f5c
Update test/components/preprocessors/test_markdown_header_splitter.py
sjrl Apr 9, 2026
6de41d9
Merge branch 'main' into bugfix/markdown_header_splitter
MechaCritter Apr 9, 2026
5c85f3d
Merge branch 'main' into bugfix/markdown_header_splitter
sjrl Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 3 additions & 8 deletions haystack/components/preprocessors/markdown_header_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]:
# process headers and build chunks
chunks: list[dict] = []
header_stack: list[str | None] = [None] * 6
active_parents: list[str] = [] # track active parent headers
pending_headers: list[str] = [] # store empty headers to prepend to next content
has_content = False # flag to track if any header has content

Expand All @@ -169,16 +168,15 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]:

# skip splits w/o content
if not content.strip(): # this strip is needed to avoid counting whitespace as content
# add as parent for subsequent headers
active_parents = [h for h in header_stack[: level - 1] if h is not None]
active_parents.append(header_text)
if self.keep_headers:
header_line = f"{header_prefix} {header_text}"
pending_headers.append(header_line)
continue

has_content = True # at least one header has content
parent_headers = list(active_parents)
# Build parent metadata from the current header stack so the first child of a
# contentful section still inherits its full ancestor chain.
parent_headers = [h for h in header_stack[: level - 1] if h is not None]

logger.debug(
"Creating chunk for header '{header_text}' at level {level}", header_text=header_text, level=level
Expand All @@ -198,9 +196,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]:
else:
chunks.append({"content": content, "meta": {"header": header_text, "parent_headers": parent_headers}})

# reset active parents
active_parents = [h for h in header_stack[: level - 1] if h is not None]

# return doc unchunked if no headers have content
if not has_content:
logger.info(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
---
fixes:
- |
When using the **MarkdownHeaderSplitter**, a child header could lose its direct parent header in the
``parent_headers`` metadata of the split chunks. For example, if one executed the code below:
.. code:: python
from haystack.components.preprocessors import MarkdownHeaderSplitter
from haystack import Document
text = """
# header 1
intro text
## header 1.1
text 1
## header 1.2
text 2
### header 1.2.1
text 3
### header 1.2.2
text 4
"""
document = Document(content=text)
splitter = MarkdownHeaderSplitter(
keep_headers=True,
secondary_split="word"
)
result = splitter.run(documents=[document])["documents"]
for doc in result:
print(f"Header: {doc.meta['header']}, parent headers: {doc.meta['parent_headers']}")
We would have expected this output:
.. code:: text
Header: header 1, parent headers: []
Header: header 1.1, parent headers: ['header 1']
Header: header 1.2, parent headers: ['header 1']
Header: header 1.2.1, parent headers: ['header 1', 'header 1.2']
Header: header 1.2.2, parent headers: ['header 1', 'header 1.2']
But instead we actually got:
.. code:: text
Header: header 1, parent headers: []
Header: header 1.1, parent headers: []
Header: header 1.2, parent headers: ['header 1']
Header: header 1.2.1, parent headers: ['header 1']
Header: header 1.2.2, parent headers: ['header 1', 'header 1.2']
The error happened when a parent header had its own content chunk before the first
child header.
This has been fixed: even when a parent header has its own content chunk before the first child header, the child's ``parent_headers`` metadata now contains the full chain of parent headers.
28 changes: 28 additions & 0 deletions test/components/preprocessors/test_markdown_header_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,34 @@ def test_basic_split(sample_text):
assert reconstructed_doc == sample_text


def test_keep_headers_preserves_parent_headers_for_first_child():
    """Check that every chunk reports its full ancestor chain in ``parent_headers``.

    The document is built so that each parent section ("Header 1", "Header 1.2")
    has its own body text before its first child header appears.
    """
    markdown_parts = [
        "# Header 1\n",
        "Intro text\n\n",
        "## Header 1.1\n",
        "Text 1\n\n",
        "## Header 1.2\n",
        "Text 2\n\n",
        "### Header 1.2.1\n",
        "Text 3\n\n",
        "### Header 1.2.2\n",
        "Text 4\n",
    ]
    text = "".join(markdown_parts)

    splitter = MarkdownHeaderSplitter(keep_headers=True)
    run_output = splitter.run(documents=[Document(content=text)])
    split_docs = run_output["documents"]

    expected_hierarchy = [
        ("Header 1", []),
        ("Header 1.1", ["Header 1"]),
        ("Header 1.2", ["Header 1"]),
        ("Header 1.2.1", ["Header 1", "Header 1.2"]),
        ("Header 1.2.2", ["Header 1", "Header 1.2"]),
    ]
    observed_hierarchy = []
    for doc in split_docs:
        observed_hierarchy.append((doc.meta["header"], doc.meta["parent_headers"]))
    assert observed_hierarchy == expected_hierarchy

    # Joining the chunk contents back together must reproduce the input exactly.
    reconstructed_text = "".join([doc.content for doc in split_docs])
    assert reconstructed_text == text


def test_split_without_headers(sample_text):
splitter = MarkdownHeaderSplitter(keep_headers=False)
docs = [Document(content=sample_text)]
Expand Down
Loading