diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py index 21311504e6..890fc1f279 100644 --- a/haystack/components/preprocessors/markdown_header_splitter.py +++ b/haystack/components/preprocessors/markdown_header_splitter.py @@ -147,7 +147,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: # process headers and build chunks chunks: list[dict] = [] header_stack: list[str | None] = [None] * 6 - active_parents: list[str] = [] # track active parent headers pending_headers: list[str] = [] # store empty headers to prepend to next content has_content = False # flag to track if any header has content @@ -169,16 +168,15 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: # skip splits w/o content if not content.strip(): # this strip is needed to avoid counting whitespace as content - # add as parent for subsequent headers - active_parents = [h for h in header_stack[: level - 1] if h is not None] - active_parents.append(header_text) if self.keep_headers: header_line = f"{header_prefix} {header_text}" pending_headers.append(header_line) continue has_content = True # at least one header has content - parent_headers = list(active_parents) + # Build parent metadata from the current header stack so the first child of a + # contentful section still inherits its full ancestor chain. 
+ parent_headers = [h for h in header_stack[: level - 1] if h is not None] logger.debug( "Creating chunk for header '{header_text}' at level {level}", header_text=header_text, level=level @@ -198,9 +196,6 @@ def _split_text_by_markdown_headers(self, text: str, doc_id: str) -> list[dict]: else: chunks.append({"content": content, "meta": {"header": header_text, "parent_headers": parent_headers}}) - # reset active parents - active_parents = [h for h in header_stack[: level - 1] if h is not None] - # return doc unchunked if no headers have content if not has_content: logger.info( diff --git a/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml new file mode 100644 index 0000000000..c2787ed5a5 --- /dev/null +++ b/releasenotes/notes/fix-missing-parent-header-error-MarkdownHeaderSplitter-b5db96e19011b6b9.yaml @@ -0,0 +1,61 @@ +--- +fixes: + - | + When using the **MarkdownHeaderSplitter**, the first child header in the split chunks previously lost + its direct parent header in the metadata. For example, if one executed the code below: + + .. code:: python + from haystack.components.preprocessors import MarkdownHeaderSplitter + from haystack import Document + + text = """ + # header 1 + intro text + + ## header 1.1 + text 1 + + ## header 1.2 + text 2 + + ### header 1.2.1 + text 3 + + ### header 1.2.2 + text 4 + """ + + document = Document(content=text) + + splitter = MarkdownHeaderSplitter( + keep_headers=True, + secondary_split="word" + ) + result = splitter.run(documents=[document])["documents"] + + for doc in result: + print(f"Header: {doc.meta['header']}, parent headers: {doc.meta['parent_headers']}") + + We would have expected this output: + + .. 
code:: text + + Header: header 1, parent headers: [] + Header: header 1.1, parent headers: ['header 1'] + Header: header 1.2, parent headers: ['header 1'] + Header: header 1.2.1, parent headers: ['header 1', 'header 1.2'] + Header: header 1.2.2, parent headers: ['header 1', 'header 1.2'] + + But instead we actually got: + + .. code:: text + Header: header 1, parent headers: [] + Header: header 1.1, parent headers: [] + Header: header 1.2, parent headers: ['header 1'] + Header: header 1.2.1, parent headers: ['header 1'] + Header: header 1.2.2, parent headers: ['header 1', 'header 1.2'] + + The error happened when a parent header had its own content chunk before the first + child header. + + This has been fixed: even when a parent header has its own content chunk before the first child header, every child chunk now keeps its full chain of parent headers in the metadata. diff --git a/test/components/preprocessors/test_markdown_header_splitter.py b/test/components/preprocessors/test_markdown_header_splitter.py index a8bfbaf976..cdbb4989e3 100644 --- a/test/components/preprocessors/test_markdown_header_splitter.py +++ b/test/components/preprocessors/test_markdown_header_splitter.py @@ -91,6 +91,34 @@ def test_basic_split(sample_text): assert reconstructed_doc == sample_text +def test_keep_headers_preserves_parent_headers_for_first_child(): + text = ( + "# Header 1\n" + "Intro text\n\n" + "## Header 1.1\n" + "Text 1\n\n" + "## Header 1.2\n" + "Text 2\n\n" + "### Header 1.2.1\n" + "Text 3\n\n" + "### Header 1.2.2\n" + "Text 4\n" + ) + splitter = MarkdownHeaderSplitter(keep_headers=True) + split_docs = splitter.run(documents=[Document(content=text)])["documents"] + + assert [(doc.meta["header"], doc.meta["parent_headers"]) for doc in split_docs] == [ + ("Header 1", []), + ("Header 1.1", ["Header 1"]), + ("Header 1.2", ["Header 1"]), + ("Header 1.2.1", ["Header 1", "Header 1.2"]), + ("Header 1.2.2", ["Header 1", "Header 1.2"]), + ] + # reconstruct original text + reconstructed_text = "".join(doc.content for doc in 
split_docs) + assert reconstructed_text == text + + def test_split_without_headers(sample_text): splitter = MarkdownHeaderSplitter(keep_headers=False) docs = [Document(content=sample_text)]