From c69396227f6ff5da45a213bd674e61c390c4a9ab Mon Sep 17 00:00:00 2001
From: deepakdevp
Date: Thu, 26 Mar 2026 20:44:24 +0900
Subject: [PATCH] fix(parse): prevent merged filename collision on duplicate headings

_generate_merged_filename() now always appends a content-based hash
suffix derived from all section names and indices. This prevents
filename collisions when multiple merge groups share the same first
heading name and count, which previously caused file overwrites and
content loss during markdown resource ingestion.

Fixes #1004.
---
 openviking/parse/parsers/markdown.py          | 15 ++++---
 .../parse/test_markdown_filename_collision.py | 44 +++++++++++++++++++
 2 files changed, 54 insertions(+), 5 deletions(-)
 create mode 100644 tests/parse/test_markdown_filename_collision.py

diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py
index abeb542423..7ded6c1586 100644
--- a/openviking/parse/parsers/markdown.py
+++ b/openviking/parse/parsers/markdown.py
@@ -601,17 +601,22 @@ def _generate_merged_filename(self, sections: List[Tuple[str, str, int]]) -> str
         count = len(names)
         max_len = self.MAX_MERGED_FILENAME_LENGTH
 
+        # Build a content-aware hash from ALL section names AND indices to guarantee
+        # uniqueness even when different merge groups share the same heading names.
+        full_key = "_".join(f"{n}:{i}" for n, _, i in sections)
+        hash_suffix = hashlib.sha256(full_key.encode()).hexdigest()[:8]
+
         if count == 1:
-            name = names[0]
+            base = names[0]
         else:
             suffix = f"_{count}more"
-            max_first_len = max_len - len(suffix)
+            max_first_len = max_len - len(suffix) - 9  # reserve space for _hash
             first_name = names[0][: max(max_first_len, 1)]
-            name = f"{first_name}{suffix}"
+            base = f"{first_name}{suffix}"
+
+        name = f"{base}_{hash_suffix}"
 
         if len(name) > max_len:
-            full_key = "_".join(names)
-            hash_suffix = hashlib.sha256(full_key.encode()).hexdigest()[:8]
             name = f"{name[: max_len - 9]}_{hash_suffix}"
 
         name = name.strip("_")
diff --git a/tests/parse/test_markdown_filename_collision.py b/tests/parse/test_markdown_filename_collision.py
new file mode 100644
index 0000000000..a88651b1f6
--- /dev/null
+++ b/tests/parse/test_markdown_filename_collision.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for _generate_merged_filename uniqueness when headings collide."""
+
+from openviking.parse.parsers.markdown import MarkdownParser
+from openviking_cli.utils.config.parser_config import ParserConfig
+
+
+class TestGenerateMergedFilenameCollision:
+    def _make_parser(self) -> MarkdownParser:
+        return MarkdownParser(ParserConfig())
+
+    def test_duplicate_heading_produces_unique_filenames(self):
+        """Merge groups with same first heading but different content must get unique filenames."""
+        parser = self._make_parser()
+
+        group1 = [
+            ("Our Culture", "content about values", 1),
+            ("Our Culture", "content about mission", 2),
+        ]
+        group2 = [
+            ("Our Culture", "content about team", 3),
+            ("Our Culture", "content about vision", 4),
+        ]
+
+        name1 = parser._generate_merged_filename(group1)
+        name2 = parser._generate_merged_filename(group2)
+
+        assert name1 != name2, f"Filenames must be unique but both are '{name1}'"
+        assert "Our" in name1 or "Culture" in name1  # Still human-readable
+
+    def test_single_section_filename_still_works(self):
+        """Single section should still produce a readable filename."""
+        parser = self._make_parser()
+
+        sections = [("Introduction", "some content", 1)]
+        name = parser._generate_merged_filename(sections)
+
+        assert "Introduction" in name
+
+    def test_empty_sections_returns_merged(self):
+        """Empty sections list should return 'merged'."""
+        parser = self._make_parser()
+        assert parser._generate_merged_filename([]) == "merged"