Skip to content

Commit d909bae

Browse files
authored
fix(semantic): preserve repository hierarchy in overviews (#1376)
1 parent b153ad9 commit d909bae

File tree

2 files changed

+101
-36
lines changed

2 files changed

+101
-36
lines changed

openviking/prompts/templates/semantic/overview_generation.yaml

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,11 @@ template: |
4545
[Subdirectories and Their Summaries]
4646
{{ children_abstracts }}
4747
48+
Relationship rules:
49+
- Treat child directories as parts of the same repository unless the summaries clearly show they are independent projects.
50+
- Do not describe every child directory as an independent project by default.
51+
- When the summaries suggest a code repository, explain how subdirectories relate to the whole repo, such as services, libraries, apps, modules, or support folders.
52+
4853
Output in Markdown format, strictly following this structure:
4954
5055
1. **Title** (H1): Directory name

tests/storage/test_semantic_processor_language.py

Lines changed: 96 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -60,14 +60,17 @@ def test_detect_language_mixed_chinese_english(self):
6060
class TestLanguageFlow:
6161
"""语言检测 + 模板渲染流程测试。"""
6262

63-
@pytest.mark.parametrize("lang,content,file_name", [
64-
("zh-CN", "这是一个中文Python文件,包含测试代码", "chinese_code.py"),
65-
("en", "This is an English Python file for testing", "english_code.py"),
66-
("ja", "これは日本語のPythonコードテストファイルです", "japanese_code.py"),
67-
("ko", "이것은 한국어 Python 코드 테스트 파일입니다", "korean_code.py"),
68-
("ru", "Это русский тестовый файл Python кода", "russian_code.py"),
69-
("ar", "هذا ملف اختبار كود بايثون عربي", "arabic_code.py"),
70-
])
63+
@pytest.mark.parametrize(
64+
"lang,content,file_name",
65+
[
66+
("zh-CN", "这是一个中文Python文件,包含测试代码", "chinese_code.py"),
67+
("en", "This is an English Python file for testing", "english_code.py"),
68+
("ja", "これは日本語のPythonコードテストファイルです", "japanese_code.py"),
69+
("ko", "이것은 한국어 Python 코드 테스트 파일입니다", "korean_code.py"),
70+
("ru", "Это русский тестовый файл Python кода", "russian_code.py"),
71+
("ar", "هذا ملف اختبار كود بايثون عربي", "arabic_code.py"),
72+
],
73+
)
7174
def test_language_detection_to_template_flow(self, lang, content, file_name):
7275
"""语言检测 -> output_language 注入模板 -> prompt 包含语言指令"""
7376
detected_lang = _detect_language_from_text(content, fallback_language="en")
@@ -83,11 +86,14 @@ def test_language_detection_to_template_flow(self, lang, content, file_name):
8386
class TestOverviewGenerationFlow:
8487
"""目录概述生成流程测试。"""
8588

86-
@pytest.mark.parametrize("lang,file_summaries", [
87-
("zh-CN", "[1] file1.py: 这是一个Python文件\n[2] file2.py: 这是另一个文件"),
88-
("en", "[1] file1.py: This is a Python file\n[2] file2.py: Another file"),
89-
("ja", "[1] file1.py: それはPythonファイルです\n[2] file2.py: これもPython"),
90-
])
89+
@pytest.mark.parametrize(
90+
"lang,file_summaries",
91+
[
92+
("zh-CN", "[1] file1.py: 这是一个Python文件\n[2] file2.py: 这是另一个文件"),
93+
("en", "[1] file1.py: This is a Python file\n[2] file2.py: Another file"),
94+
("ja", "[1] file1.py: それはPythonファイルです\n[2] file2.py: これもPython"),
95+
],
96+
)
9197
def test_overview_generation_language_flow(self, lang, file_summaries):
9298
"""目录摘要 -> 语言检测 -> overview 模板"""
9399
detected_lang = _detect_language_from_text(file_summaries, fallback_language="en")
@@ -104,6 +110,31 @@ def test_overview_generation_language_flow(self, lang, file_summaries):
104110
)
105111
assert f"Output Language: {lang}" in prompt
106112

113+
def test_overview_generation_prompt_preserves_repository_hierarchy(self):
114+
prompt = render_prompt(
115+
"semantic.overview_generation",
116+
{
117+
"dir_name": "repo-root",
118+
"file_summaries": "[1] pyproject.toml: Python project config",
119+
"children_abstracts": "- backend/: API service\n- frontend/: web UI",
120+
"output_language": "en",
121+
},
122+
)
123+
124+
assert "Relationship rules:" in prompt
125+
assert (
126+
"- Treat child directories as parts of the same repository unless the summaries clearly show they are independent projects."
127+
in prompt
128+
)
129+
assert (
130+
"- Do not describe every child directory as an independent project by default."
131+
in prompt
132+
)
133+
assert (
134+
"- When the summaries suggest a code repository, explain how subdirectories relate to the whole repo, such as services, libraries, apps, modules, or support folders."
135+
in prompt
136+
)
137+
107138

108139
class LanguageAwareMockVLM:
109140
"""语言感知的 MockVLM,根据 prompt 中的 Output Language 返回对应语言的响应。"""
@@ -161,19 +192,27 @@ def temp_multilang_files(self):
161192
files["chinese_py"].write_text("# 中文Python文件\ndef 你好():\n print('你好世界')\n")
162193

163194
files["english_py"] = tmppath / "english_code.py"
164-
files["english_py"].write_text("# English Python file\ndef hello():\n print('Hello World')\n")
195+
files["english_py"].write_text(
196+
"# English Python file\ndef hello():\n print('Hello World')\n"
197+
)
165198

166199
files["japanese_py"] = tmppath / "japanese_code.py"
167-
files["japanese_py"].write_text("# 日本語Pythonファイル\ndef こんにちは():\n print('こんにちは世界')\n")
200+
files["japanese_py"].write_text(
201+
"# 日本語Pythonファイル\ndef こんにちは():\n print('こんにちは世界')\n"
202+
)
168203

169204
files["korean_py"] = tmppath / "korean_code.py"
170-
files["korean_py"].write_text("# 한국어 Python 파일\ndef 안녕하세요():\n print('안녕하세요')\n")
205+
files["korean_py"].write_text(
206+
"# 한국어 Python 파일\ndef 안녕하세요():\n print('안녕하세요')\n"
207+
)
171208

172209
files["chinese_md"] = tmppath / "chinese_doc.md"
173210
files["chinese_md"].write_text("# 中文文档\n\n这是一个测试文档,包含中文技术内容。\n")
174211

175212
files["english_md"] = tmppath / "english_doc.md"
176-
files["english_md"].write_text("# English Documentation\n\nThis is a test document with English content.\n")
213+
files["english_md"].write_text(
214+
"# English Documentation\n\nThis is a test document with English content.\n"
215+
)
177216

178217
yield files
179218

@@ -191,14 +230,17 @@ def _create_mock_config(self, mock_vlm: LanguageAwareMockVLM) -> MagicMock:
191230
return mock_config
192231

193232
@pytest.mark.asyncio
194-
@pytest.mark.parametrize("file_key,file_name,expected_lang", [
195-
("chinese_py", "chinese_code.py", "zh-CN"),
196-
("english_py", "english_code.py", "en"),
197-
("japanese_py", "japanese_code.py", "ja"),
198-
("korean_py", "korean_code.py", "ko"),
199-
("chinese_md", "chinese_doc.md", "zh-CN"),
200-
("english_md", "english_doc.md", "en"),
201-
])
233+
@pytest.mark.parametrize(
234+
"file_key,file_name,expected_lang",
235+
[
236+
("chinese_py", "chinese_code.py", "zh-CN"),
237+
("english_py", "english_code.py", "en"),
238+
("japanese_py", "japanese_code.py", "ja"),
239+
("korean_py", "korean_code.py", "ko"),
240+
("chinese_md", "chinese_doc.md", "zh-CN"),
241+
("english_md", "english_doc.md", "en"),
242+
],
243+
)
202244
async def test_e2e_code_output_language(
203245
self, temp_multilang_files, file_key, file_name, expected_lang
204246
):
@@ -210,8 +252,14 @@ async def test_e2e_code_output_language(
210252
mock_viking_fs = self._create_mock_viking_fs(content)
211253
mock_config = self._create_mock_config(mock_vlm)
212254

213-
with patch("openviking.storage.queuefs.semantic_processor.get_viking_fs", return_value=mock_viking_fs):
214-
with patch("openviking.storage.queuefs.semantic_processor.get_openviking_config", return_value=mock_config):
255+
with patch(
256+
"openviking.storage.queuefs.semantic_processor.get_viking_fs",
257+
return_value=mock_viking_fs,
258+
):
259+
with patch(
260+
"openviking.storage.queuefs.semantic_processor.get_openviking_config",
261+
return_value=mock_config,
262+
):
215263
processor = SemanticProcessor()
216264
processor._current_ctx = MagicMock()
217265

@@ -222,17 +270,22 @@ async def test_e2e_code_output_language(
222270
)
223271

224272
prompt_sent = mock_vlm.prompts_received[0]
225-
assert f"Output Language: {expected_lang}" in prompt_sent, \
273+
assert f"Output Language: {expected_lang}" in prompt_sent, (
226274
f"{file_name}: Prompt missing Output Language: {expected_lang}"
275+
)
227276

228-
assert _verify_content_language(result["summary"], expected_lang), \
277+
assert _verify_content_language(result["summary"], expected_lang), (
229278
f"{file_name}: Content language mismatch. Expected {expected_lang}, got: {result['summary']}"
279+
)
230280

231281
@pytest.mark.asyncio
232-
@pytest.mark.parametrize("content,file_name,expected_lang", [
233-
("Это русский тестовый файл Python", "russian_code.py", "ru"),
234-
("هذا ملف اختبار كود بايثون عربي", "arabic_code.py", "ar"),
235-
])
282+
@pytest.mark.parametrize(
283+
"content,file_name,expected_lang",
284+
[
285+
("Это русский тестовый файл Python", "russian_code.py", "ru"),
286+
("هذا ملف اختبار كود بايثون عربي", "arabic_code.py", "ar"),
287+
],
288+
)
236289
async def test_e2e_russian_arabic_output_language(self, content, file_name, expected_lang):
237290
"""端到端测试:俄文和阿拉伯文内容"""
238291
from openviking.storage.queuefs.semantic_processor import SemanticProcessor
@@ -241,8 +294,14 @@ async def test_e2e_russian_arabic_output_language(self, content, file_name, expe
241294
mock_viking_fs = self._create_mock_viking_fs(content)
242295
mock_config = self._create_mock_config(mock_vlm)
243296

244-
with patch("openviking.storage.queuefs.semantic_processor.get_viking_fs", return_value=mock_viking_fs):
245-
with patch("openviking.storage.queuefs.semantic_processor.get_openviking_config", return_value=mock_config):
297+
with patch(
298+
"openviking.storage.queuefs.semantic_processor.get_viking_fs",
299+
return_value=mock_viking_fs,
300+
):
301+
with patch(
302+
"openviking.storage.queuefs.semantic_processor.get_openviking_config",
303+
return_value=mock_config,
304+
):
246305
processor = SemanticProcessor()
247306
processor._current_ctx = MagicMock()
248307

@@ -255,5 +314,6 @@ async def test_e2e_russian_arabic_output_language(self, content, file_name, expe
255314
prompt_sent = mock_vlm.prompts_received[0]
256315
assert f"Output Language: {expected_lang}" in prompt_sent
257316

258-
assert _verify_content_language(result["summary"], expected_lang), \
317+
assert _verify_content_language(result["summary"], expected_lang), (
259318
f"{file_name}: Content language mismatch. Expected {expected_lang}, got: {result['summary']}"
319+
)

0 commit comments

Comments
 (0)