diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py index abeb542423..7ded6c1586 100644 --- a/openviking/parse/parsers/markdown.py +++ b/openviking/parse/parsers/markdown.py @@ -601,17 +601,22 @@ def _generate_merged_filename(self, sections: List[Tuple[str, str, int]]) -> str count = len(names) max_len = self.MAX_MERGED_FILENAME_LENGTH + # Build a content-aware hash from ALL section names AND indices to guarantee + # uniqueness even when different merge groups share the same heading names. + full_key = "_".join(f"{n}:{i}" for n, _, i in sections) + hash_suffix = hashlib.sha256(full_key.encode()).hexdigest()[:8] + if count == 1: - name = names[0] + base = names[0] else: suffix = f"_{count}more" - max_first_len = max_len - len(suffix) + max_first_len = max_len - len(suffix) - 9 # reserve space for _hash first_name = names[0][: max(max_first_len, 1)] - name = f"{first_name}{suffix}" + base = f"{first_name}{suffix}" + + name = f"{base}_{hash_suffix}" if len(name) > max_len: - full_key = "_".join(names) - hash_suffix = hashlib.sha256(full_key.encode()).hexdigest()[:8] name = f"{name[: max_len - 9]}_{hash_suffix}" name = name.strip("_") diff --git a/tests/parse/test_markdown_filename_collision.py b/tests/parse/test_markdown_filename_collision.py new file mode 100644 index 0000000000..a88651b1f6 --- /dev/null +++ b/tests/parse/test_markdown_filename_collision.py @@ -0,0 +1,44 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
# SPDX-License-Identifier: Apache-2.0
"""Tests for _generate_merged_filename uniqueness when headings collide."""

from openviking.parse.parsers.markdown import MarkdownParser
from openviking_cli.utils.config.parser_config import ParserConfig


class TestGenerateMergedFilenameCollision:
    """Checks on the filenames produced for merged markdown sections.

    Each section is passed as a ``(heading, content, index)`` tuple, which is
    the shape ``MarkdownParser._generate_merged_filename`` is called with here.
    """

    def _make_parser(self) -> MarkdownParser:
        """Return a parser constructed from a default ``ParserConfig``."""
        config = ParserConfig()
        return MarkdownParser(config)

    def test_duplicate_heading_produces_unique_filenames(self):
        """Groups sharing one heading but differing in bodies and indices get distinct names."""
        md_parser = self._make_parser()

        first_group = [
            ("Our Culture", "content about values", 1),
            ("Our Culture", "content about mission", 2),
        ]
        second_group = [
            ("Our Culture", "content about team", 3),
            ("Our Culture", "content about vision", 4),
        ]

        first_name = md_parser._generate_merged_filename(first_group)
        second_name = md_parser._generate_merged_filename(second_group)

        assert first_name != second_name, (
            f"Filenames must be unique but both are '{first_name}'"
        )
        # Part of the heading must survive so the name stays human-readable.
        assert any(word in first_name for word in ("Our", "Culture"))

    def test_single_section_filename_still_works(self):
        """A lone section keeps its heading text in the generated filename."""
        lone_section = [("Introduction", "some content", 1)]

        filename = self._make_parser()._generate_merged_filename(lone_section)

        assert "Introduction" in filename

    def test_empty_sections_returns_merged(self):
        """An empty section list yields the fallback name 'merged'."""
        md_parser = self._make_parser()

        fallback = md_parser._generate_merged_filename([])

        assert fallback == "merged"