diff --git a/backend/app/services/text_processor.py b/backend/app/services/text_processor.py index 91e32acc5d..87ad71930d 100644 --- a/backend/app/services/text_processor.py +++ b/backend/app/services/text_processor.py @@ -2,13 +2,32 @@ 文本处理服务 """ +import re from typing import List, Optional from ..utils.file_parser import FileParser, split_text_into_chunks class TextProcessor: """文本处理器""" - + + # 这类章节和说明更像“如何喂系统”的元文本,不应直接送入图谱抽取。 + GRAPH_META_SECTION_PATTERNS = ( + r"给\s*MiroFish\s*/\s*知识图谱的约定", + r"剧集编号与惯用标题", + r"防误抽", + ) + + GRAPH_META_LINE_PATTERNS = ( + r"本文供\s*\*{0,2}MiroFish", + r"供\s*MiroFish.*抽取.*推演", + r"请勿.*Agent", + r"请勿.*节点", + r"非角色", + r"抽取实体时请合并", + r"平行假设放哪", + r"勿把平行结局写进本文件正文", + ) + @staticmethod def extract_from_files(file_paths: List[str]) -> str: """从多个文件提取文本""" @@ -31,6 +50,7 @@ def split_text( Returns: 文本块列表 """ + text = TextProcessor.preprocess_text(text) return split_text_into_chunks(text, chunk_size, overlap) @staticmethod @@ -39,6 +59,8 @@ def preprocess_text(text: str) -> str: 预处理文本 - 移除多余空白 - 标准化换行 + - 保守清理“说明层/导航层”文本 + - 删除剧集编号与英文单集标题等非主体锚点 Args: text: 原始文本 @@ -46,18 +68,68 @@ def preprocess_text(text: str) -> str: Returns: 处理后的文本 """ - import re - # 标准化换行 text = text.replace('\r\n', '\n').replace('\r', '\n') - - # 移除连续空行(保留最多两个换行) + + raw_lines = [line.strip() for line in text.split('\n')] + title_pattern = re.compile(r"(?= 2: + canonical = re.sub(r"\*+", "", columns[0]).strip() + note = columns[1] + line = f"| {canonical} | {note.strip()} |" + + if any(re.search(pattern, line, re.IGNORECASE) for pattern in TextProcessor.GRAPH_META_LINE_PATTERNS): + continue + + line = re.sub(r"\bS\d{2}E\d{2}\b", "", line) + line = re.sub(r"(? dict: "total_lines": text.count('\n') + 1, "total_words": len(text.split()), } - diff --git a/backend/tests/test_text_processor.py b/backend/tests/test_text_processor.py new file mode 100644 index 0000000000..8ff370fed7 --- /dev/null +++ b/backend/tests/test_text_processor.py @@ -0,0 +1,64 @@ +from importlib.util import module_from_spec, spec_from_file_location +from pathlib import Path +import sys + + +def _load_text_processor(): + backend_dir = Path(__file__).resolve().parents[1] + if str(backend_dir) not in sys.path: + sys.path.insert(0, str(backend_dir)) + + module_path = backend_dir / "app" / "services" / "text_processor.py" + spec = spec_from_file_location("app.services.text_processor_test", module_path) + module = module_from_spec(spec) + assert spec and spec.loader + spec.loader.exec_module(module) + return module.TextProcessor + + +TextProcessor = _load_text_processor() + + +def test_preprocess_text_removes_graph_meta_and_episode_titles(): + text = """ +本文供 **MiroFish** 等工具做人物与关系底座抽取与推演。 + +## 给 MiroFish / 知识图谱的约定(请先读) +1. **剧集怎样引用**:S04E13 *Face Off* 不是角色,勿单独建 Agent 节点。 + +## Walter White(一类) +高中化学教师,已以 **Heisenberg** 身份与 Jesse 制毒。 +7. *End Times*:利用 Brock 急病建构 Gus 陷害叙事。 +8. *Face Off* 正史:与 Hector 合谋炸弹杀 Gus。 +""" + + processed = TextProcessor.preprocess_text(text) + + assert "MiroFish" not in processed + assert "8. *Face Off*" not in processed + assert "S04E13" not in processed + assert "Walter White" in processed + assert "Heisenberg" in processed + assert "Gus" in processed + + +def test_preprocess_text_keeps_naming_table_but_strips_episode_title_noise(): + text = """ +## 命名规范(抽取实体时请合并为单一节点) +| Walter White | 亦称 Walt;**Heisenberg** 仍指同一人,勿拆节点。 | +| Hank Schrader | DEA 探员。**DEA 为机构**,勿与 Hank 重复为同级「人物节点」。 | + +## 剧集编号与惯用标题(防误抽) +| **S04E13** | *Face Off* | +- **请勿**将 *Mandala*、*Face Off* 等注册为同级人物 Agent——它们是**集名**。 +""" + + processed = TextProcessor.preprocess_text(text) + + assert "Graph Extraction Hints" not in processed + assert "Walter White" in processed + assert "Heisenberg" in processed + assert "DEA 为机构" in processed + assert "S04E13" not in processed + assert "Face Off" not in processed + assert "Mandala" not in processed