Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 82 additions & 11 deletions backend/app/services/text_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,32 @@
文本处理服务
"""

import re
from typing import List, Optional
from ..utils.file_parser import FileParser, split_text_into_chunks


class TextProcessor:
"""文本处理器"""


# Sections and notes like these are meta-text about "how to feed the system";
# they should not be sent directly into graph extraction.
#
# Regexes tested (case-insensitively) against "## " heading lines by
# preprocess_text: a matching heading is dropped together with the non-blank
# lines that follow it, up to the next "## " heading.
GRAPH_META_SECTION_PATTERNS = (
r"给\s*MiroFish\s*/\s*知识图谱的约定",
r"剧集编号与惯用标题",
r"防误抽",
)

# Regexes tested (case-insensitively) against each remaining line by
# preprocess_text: any line matching one of them is dropped on its own.
GRAPH_META_LINE_PATTERNS = (
r"本文供\s*\*{0,2}MiroFish",
r"供\s*MiroFish.*抽取.*推演",
r"请勿.*Agent",
r"请勿.*节点",
r"非角色",
r"抽取实体时请合并",
r"平行假设放哪",
r"勿把平行结局写进本文件正文",
)

@staticmethod
def extract_from_files(file_paths: List[str]) -> str:
"""从多个文件提取文本"""
Expand All @@ -31,6 +50,7 @@ def split_text(
Returns:
文本块列表
"""
text = TextProcessor.preprocess_text(text)
return split_text_into_chunks(text, chunk_size, overlap)

@staticmethod
def preprocess_text(text: str) -> str:
    """
    Clean raw document text before it is split into chunks.

    Steps, in order:
    - normalize line endings to ``\\n`` and strip every line;
    - collect episode titles (``*English Title*``) from lines that contain
      "惯用标题" or from table rows that carry an ``SxxExx`` episode code;
    - drop whole "## " sections whose heading matches
      ``GRAPH_META_SECTION_PATTERNS``;
    - inside a "## " section whose heading contains "命名规范", reduce table
      rows to their first two columns and remove emphasis asterisks from
      the first column;
    - drop single lines matching ``GRAPH_META_LINE_PATTERNS``;
    - delete ``SxxExx`` codes, single-asterisk English titles and every
      collected episode title from the remaining lines;
    - drop lines left with nothing but table/markdown punctuation;
    - collapse repeated whitespace and runs of three or more newlines.

    Args:
        text: Raw text.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Normalize line endings.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    raw_lines = [line.strip() for line in text.split('\n')]

    # Compile every pattern once instead of once per line.
    episode_code = re.compile(r"\bS\d{2}E\d{2}\b")
    title_pattern = re.compile(r"(?<!\*)\*([A-Za-z][A-Za-z0-9 '\-]{1,60})\*(?!\*)")
    section_patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in TextProcessor.GRAPH_META_SECTION_PATTERNS
    ]
    line_patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in TextProcessor.GRAPH_META_LINE_PATTERNS
    ]

    # First pass: gather episode titles that must never become graph subjects.
    non_subject_terms = set()
    for line in raw_lines:
        if "惯用标题" in line or ("|" in line and episode_code.search(line)):
            non_subject_terms.update(
                match.strip() for match in title_pattern.findall(line)
            )

    # Longest titles first so a short title cannot split a longer one.
    # ASCII look-arounds keep a title from being cut out of a longer word
    # (e.g. "Fly" inside "Butterfly") while still matching when the title
    # touches CJK text, which ``\b`` would treat as a word character.
    term_patterns = [
        re.compile(
            rf"\*{{0,3}}(?<![A-Za-z0-9]){re.escape(term)}(?![A-Za-z0-9])\*{{0,3}}"
        )
        for term in sorted(non_subject_terms, key=len, reverse=True)
    ]

    cleaned_lines: List[str] = []
    skip_section = False
    in_naming_section = False

    for line in raw_lines:
        # Blank lines are always kept; runs of them are collapsed at the end.
        if not line:
            cleaned_lines.append("")
            continue

        # Every "## " heading re-evaluates both section flags.
        if line.startswith("## "):
            in_naming_section = "命名规范" in line
            skip_section = any(
                pattern.search(line) for pattern in section_patterns
            )

        if skip_section:
            continue

        # Naming tables: keep only "canonical name | note".
        if in_naming_section and "|" in line:
            columns = [part.strip() for part in line.strip("|").split("|")]
            if len(columns) >= 2:
                canonical = re.sub(r"\*+", "", columns[0]).strip()
                line = f"| {canonical} | {columns[1]} |"

        if any(pattern.search(line) for pattern in line_patterns):
            continue

        line = episode_code.sub("", line)
        line = title_pattern.sub("", line)
        for term_pattern in term_patterns:
            line = term_pattern.sub("", line)

        # Table separator rows and lines reduced to markup carry no content.
        if re.fullmatch(r"[|\-:\s]+", line):
            continue
        if not re.sub(r"[\s|:\-*`_]+", "", line):
            continue

        line = re.sub(r"\s{2,}", " ", line).strip()
        if not line:
            continue

        cleaned_lines.append(line)

    # Keep at most one empty line between paragraphs.
    text = "\n".join(cleaned_lines)
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()

@staticmethod
Expand All @@ -68,4 +140,3 @@ def get_text_stats(text: str) -> dict:
"total_lines": text.count('\n') + 1,
"total_words": len(text.split()),
}

64 changes: 64 additions & 0 deletions backend/tests/test_text_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from importlib.util import module_from_spec, spec_from_file_location
from pathlib import Path
import sys


def _load_text_processor():
    """Load ``TextProcessor`` directly from ``app/services/text_processor.py``.

    The backend directory (the parent of this tests directory) is put on
    ``sys.path`` if missing, then the source file is executed under the
    module name ``app.services.text_processor_test``.

    Returns:
        The ``TextProcessor`` class defined in the loaded module.

    Raises:
        ImportError: If no import spec or loader can be built for the file.
    """
    backend_dir = Path(__file__).resolve().parents[1]
    backend_path = str(backend_dir)
    if backend_path not in sys.path:
        sys.path.insert(0, backend_path)

    module_path = backend_dir / "app" / "services" / "text_processor.py"
    spec = spec_from_file_location("app.services.text_processor_test", module_path)
    # Validate before use: module_from_spec(None) would otherwise fail with
    # an opaque AttributeError before any check could run.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load TextProcessor from {module_path}")

    module = module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.TextProcessor


# Load the class once at import time; every test below uses this binding.
TextProcessor = _load_text_processor()


def test_preprocess_text_removes_graph_meta_and_episode_titles():
    """Meta sections, episode codes and italic episode titles are removed,
    while the character section and its names are kept."""
    text = """
本文供 **MiroFish** 等工具做人物与关系底座抽取与推演。

## 给 MiroFish / 知识图谱的约定(请先读)
1. **剧集怎样引用**:S04E13 *Face Off* 不是角色,勿单独建 Agent 节点。

## Walter White(一类)
高中化学教师,已以 **Heisenberg** 身份与 Jesse 制毒。
7. *End Times*:利用 Brock 急病建构 Gus 陷害叙事。
8. *Face Off* 正史:与 Hector 合谋炸弹杀 Gus。
"""

    processed = TextProcessor.preprocess_text(text)

    # Meta text and the whole skipped section are gone.
    assert "MiroFish" not in processed
    assert "Agent" not in processed
    assert "S04E13" not in processed

    # Episode titles are gone in every spelling, not just one literal form.
    assert "Face Off" not in processed
    assert "End Times" not in processed

    # Subject content survives.
    assert "Walter White" in processed
    assert "Heisenberg" in processed
    assert "Jesse" in processed
    assert "Hector" in processed
    assert "Gus" in processed


def test_preprocess_text_keeps_naming_table_but_strips_episode_title_noise():
    """Naming-table rows are kept, while the extraction instruction in the
    heading and the episode-title section are removed."""
    text = """
## 命名规范(抽取实体时请合并为单一节点)
| Walter White | 亦称 Walt;**Heisenberg** 仍指同一人,勿拆节点。 |
| Hank Schrader | DEA 探员。**DEA 为机构**,勿与 Hank 重复为同级「人物节点」。 |

## 剧集编号与惯用标题(防误抽)
| **S04E13** | *Face Off* |
- **请勿**将 *Mandala*、*Face Off* 等注册为同级人物 Agent——它们是**集名**。
"""

    processed = TextProcessor.preprocess_text(text)

    # The extraction instruction and the meta section are removed.
    assert "抽取实体时请合并" not in processed
    assert "防误抽" not in processed
    assert "请勿" not in processed
    assert "Agent" not in processed

    # Naming-table rows are preserved.
    assert "Walter White" in processed
    assert "Hank Schrader" in processed
    assert "Heisenberg" in processed
    assert "DEA 为机构" in processed

    # Episode codes and titles are gone.
    assert "S04E13" not in processed
    assert "Face Off" not in processed
    assert "Mandala" not in processed