feat(splitter): 支持语义分片粒度配置 (#133)

Yuning Bian · jixua · commit bdfd5409144f · 2026-06-06T13:36:30.000+08:00
新增 CHUNKING_SEMANTIC_UNIT 配置，支持 sentence / paragraph。

段落模式下按段落计算相似度，超长段落仍做长度保底拆分。

补充分片、配置校验和工厂接线测试，并同步分块配置文档。

删除 .claude/skills 链接。
diff --git a/.claude/skills b/.claude/skills
diff --git a/.env.example b/.env.example
@@ -50,6 +50,7 @@ MARKDOWN_PARSER_VISION_CONCURRENCY=24
 CHUNKING_ENABLE_ADVANCED_PIPELINE=true
 CHUNKING_HEADING_BREAK_LEVEL=3
 CHUNKING_SEMANTIC_PERCENTILE=95
+CHUNKING_SEMANTIC_UNIT=sentence
 CHUNKING_MIN_CHUNK_TOKENS=150
 CHUNKING_MAX_CHUNK_TOKENS=512
 CHUNKING_OVERLAP_TOKENS=64
diff --git a/docs/internals/chunking.md b/docs/internals/chunking.md
@@ -85,11 +85,13 @@ class BaseChunker(ABC):
 
 核心思路：
 
-- 先把文本拆成段落、行或句子级原子单元。
+- 先按 `semantic_unit` 配置把文本拆成语义比较原子；默认 `sentence` 保持原有段落、行、句子逐级降级行为，`paragraph` 则以段落作为相似度计算单位。
 - 调用 embedding 模型计算相邻原子的语义距离。
 - 使用距离分位数作为动态阈值寻找断点。
 - 受 `min_chunk_tokens`、`max_chunk_tokens`、`overlap_tokens` 控制。
 
+`paragraph` 模式只改变相似度计算粒度：单个段落超过 `max_chunk_tokens` 时，不会再改用句子级 embedding 计算断点，但最终输出仍会做长度保底拆分，避免生成超长 Chunk。
+
 它通常不直接作为主分片器使用，而是被 `StructuredSemanticChunker` 注入。
 
 ### 3.3 StructuredSemanticChunker
diff --git a/docs/ops/configure.md b/docs/ops/configure.md
@@ -89,6 +89,7 @@
 | `CHUNKING_OVERLAP_TOKENS` | 64 | 提升召回时加大 |
 | `CHUNKING_HEADING_BREAK_LEVEL` | 3 | 提升结构敏感性时减小 |
 | `CHUNKING_SEMANTIC_PERCENTILE` | 95 | 调整语义边界严格度 |
+| `CHUNKING_SEMANTIC_UNIT` | `sentence` | 语义相似度计算粒度：`sentence` / `paragraph` |
 | `CHUNKING_EMBED_BATCH_SIZE` | 32 | 受向量服务并发上限约束 |
 
 详细分块策略见 [chunking.md](../internals/chunking.md)。
diff --git a/src/config.py b/src/config.py
@@ -109,12 +109,21 @@ def assemble_redis_url(cls, v: Optional[str], info) -> str:
     CHUNKING_ENABLE_ADVANCED_PIPELINE: bool = True
     CHUNKING_HEADING_BREAK_LEVEL: int = 3
     CHUNKING_SEMANTIC_PERCENTILE: float = 95.0
+    CHUNKING_SEMANTIC_UNIT: str = "sentence"
     CHUNKING_MIN_CHUNK_TOKENS: int = 150
     CHUNKING_MAX_CHUNK_TOKENS: int = 512
     CHUNKING_OVERLAP_TOKENS: int = 64
     CHUNKING_MIN_DISTANCE_GATE: float = 0.25
     CHUNKING_EMBED_BATCH_SIZE: int = 32
 
+    @field_validator("CHUNKING_SEMANTIC_UNIT")
+    @classmethod
+    def validate_chunking_semantic_unit(cls, v: str) -> str:
+        normalized = v.strip().lower()  # tolerate surrounding whitespace and mixed case
+        if normalized not in {"sentence", "paragraph"}:  # the only accepted units
+            raise ValueError("CHUNKING_SEMANTIC_UNIT must be 'sentence' or 'paragraph'")
+        return normalized  # hand back the normalized form, not the raw input
+
     # ==========================================
     # 向量数据库配置 (Vector Store)
     # ==========================================
diff --git a/src/core/splitter/factory.py b/src/core/splitter/factory.py
@@ -12,7 +12,7 @@
 
 from src.config import settings
 from src.core.llm.factory import ModelFactory
-from src.core.llm.interfaces import CapabilityType
+from src.core.llm.interfaces import CapabilityType, IEmbedder
 from src.core.llm.tokenizer import Tokenizer
 
 from .chunking_engine import ChunkingEngine
@@ -63,7 +63,7 @@ def __init__(
         )
 
 
-class LazyEmbeddingClient:
+class LazyEmbeddingClient(IEmbedder):
     """延迟初始化的 Embedding 客户端包装器。
 
     Chunk 索引并非主链路 ACK 的前置条件。延迟创建 Embedding 客户端可以避免
@@ -130,6 +130,7 @@ def create_chunking_engine() -> ChunkingEngine:
             embedder=embedder,
             tokenizer=Tokenizer(),
             percentile=settings.CHUNKING_SEMANTIC_PERCENTILE,
+            semantic_unit=settings.CHUNKING_SEMANTIC_UNIT,
             min_chunk_tokens=settings.CHUNKING_MIN_CHUNK_TOKENS,
             max_chunk_tokens=settings.CHUNKING_MAX_CHUNK_TOKENS,
             overlap_tokens=settings.CHUNKING_OVERLAP_TOKENS,
diff --git a/src/core/splitter/semantic_chunker.py b/src/core/splitter/semantic_chunker.py
@@ -51,6 +51,7 @@ class PercentileSemanticChunker:
     """
 
     SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[。！？；])|(?<=[.?!;])\s+")
+    SUPPORTED_SEMANTIC_UNITS = frozenset({"sentence", "paragraph"})
 
     def __init__(
         self,
@@ -62,6 +63,7 @@ def __init__(
         overlap_tokens: int = 50,
         overlap_percentage: float | None = None,
         min_distance_gate: float = 0.25,
+        semantic_unit: str = "sentence",
     ):
         """
             初始化语义切片器及其阈值、长度约束与 overlap 配置。
@@ -75,10 +77,12 @@ def __init__(
             overlap_tokens: 相邻 Chunk 的固定 token overlap 上限。
             overlap_percentage: 可选的 overlap 百分比配置；当 `overlap_tokens` 为 0 时启用。
             min_distance_gate: 绝对最小语义距离阈值，用于避免过度切分。
+            semantic_unit: 语义相似度计算粒度，支持 `sentence` 或 `paragraph`。
 
         Returns:
             None.
         """
+        semantic_unit = semantic_unit.strip().lower()
         if not 0 < percentile <= 100:
             raise ValueError("percentile must be in (0, 100].")
         if min_chunk_tokens <= 0:
@@ -93,6 +97,9 @@ def __init__(
             raise ValueError("overlap_percentage must be in [0, 1).")
         if min_distance_gate < 0:
             raise ValueError("min_distance_gate cannot be negative.")
+        if semantic_unit not in self.SUPPORTED_SEMANTIC_UNITS:
+            supported = ", ".join(sorted(self.SUPPORTED_SEMANTIC_UNITS))
+            raise ValueError(f"semantic_unit must be one of: {supported}.")
 
         self.embedder = embedder
         self.tokenizer = tokenizer
@@ -102,6 +109,7 @@ def __init__(
         self.overlap_tokens = overlap_tokens
         self.overlap_percentage = overlap_percentage
         self.min_distance_gate = min_distance_gate
+        self.semantic_unit = semantic_unit
         self.last_stats = SemanticChunkingStats()
 
     def _resolve_overlap_tokens(self) -> int:
@@ -243,6 +251,18 @@ def _split_by_sentences(self, text: str) -> List[str]:
             atoms.append(current)
         return atoms
 
+    def _split_paragraphs(self, text: str) -> List[str]:
+        """
+            Split text at Markdown-style paragraph boundaries (two or more newlines).
+
+        Args:
+            text: The large text block to split.
+
+        Returns:
+            List[str]: Non-empty, stripped paragraphs; line breaks inside one are kept.
+        """
+        return [paragraph.strip() for paragraph in re.split(r"\n{2,}", text) if paragraph.strip()]
+
     def _atomize_text(self, text: str) -> List[str]:
         """
             执行原子化拆解，优先按段落切分，必要时降级为按行或按句切分。
@@ -253,7 +273,10 @@ def _atomize_text(self, text: str) -> List[str]:
         Returns:
             List[str]: 原子化后的文本单元列表。
         """
-        paragraphs = [paragraph.strip() for paragraph in re.split(r"\n{2,}", text) if paragraph.strip()]
+        paragraphs = self._split_paragraphs(text)
+        if self.semantic_unit == "paragraph":
+            return paragraphs
+
         atoms: List[str] = []
 
         for paragraph in paragraphs:
@@ -274,6 +297,25 @@ def _atomize_text(self, text: str) -> List[str]:
 
         return [atom for atom in atoms if atom.strip()]
 
+    def _append_limited_chunk(self, chunks: list[str], text: str) -> None:
+        """
+            Append a final chunk text; oversized text only gets a length-based fallback split.
+
+        Args:
+            chunks: The list of final chunks to append to (mutated in place).
+            text: The chunk text that is about to be emitted.
+
+        Returns:
+            None.
+        """
+        cleaned = text.strip()
+        if not cleaned:  # nothing left after stripping, so nothing is appended
+            return
+        if self._count_tokens(cleaned) <= self.max_chunk_tokens:
+            chunks.append(cleaned)  # within the token limit: keep as a single chunk
+            return
+        chunks.extend(self._split_oversized_text(cleaned))  # helper defined outside this hunk
+
     @staticmethod
     def _compute_distances(embeddings: Sequence[Sequence[float]]) -> list[float]:
         """
@@ -391,7 +433,9 @@ def _group_atom_indices(
 
         for idx in range(1, len(atoms)):
             next_atom = atoms[idx].strip()
-            distance = distances[idx - 1] if distances is not None and idx - 1 < len(distances) else None
+            distance = (
+                distances[idx - 1] if distances is not None and idx - 1 < len(distances) else None
+            )
 
             semantic_breakpoint = (
                 distance is not None
@@ -454,7 +498,9 @@ def _merge_atoms(
 
         for idx in range(1, len(atoms)):
             next_atom = atoms[idx].strip()
-            distance = distances[idx - 1] if distances is not None and idx - 1 < len(distances) else None
+            distance = (
+                distances[idx - 1] if distances is not None and idx - 1 < len(distances) else None
+            )
 
             semantic_breakpoint = (
                 distance is not None
@@ -469,15 +515,15 @@ def _merge_atoms(
             if overflow_forced or (
                 semantic_breakpoint and self._count_tokens(current_text) >= self.min_chunk_tokens
             ):
-                chunks.append(current_text)
+                self._append_limited_chunk(chunks, current_text)
                 if semantic_breakpoint and not overflow_forced:
                     breakpoints.append(idx - 1)
                 current_text = self._build_next_chunk(current_text, next_atom)
             else:
                 current_text = merged_candidate
 
         if current_text:
-            chunks.append(current_text)
+            self._append_limited_chunk(chunks, current_text)
 
         self.last_stats = SemanticChunkingStats(
             atom_count=len(atoms),
@@ -509,7 +555,8 @@ async def group_texts(self, texts: Sequence[str]) -> list[list[int]]:
 
         try:
             embedding_result = await self.embedder.embed(list(atoms))
-            embeddings = [list(map(float, vector)) for vector in embedding_result.embeddings]
+            raw_embeddings = getattr(embedding_result, "embeddings", None) or []
+            embeddings = [list(map(float, vector)) for vector in raw_embeddings]
             if len(embeddings) != len(atoms) or any(not vector for vector in embeddings):
                 raise ValueError(
                     f"Embedding shape mismatch: got {len(embeddings)} vectors, expected {len(atoms)}."
@@ -547,7 +594,8 @@ async def split(self, text_block: str) -> List[str]:
 
         try:
             embedding_result = await self.embedder.embed(list(atoms))
-            embeddings = [list(map(float, vector)) for vector in embedding_result.embeddings]
+            raw_embeddings = getattr(embedding_result, "embeddings", None) or []
+            embeddings = [list(map(float, vector)) for vector in raw_embeddings]
             if len(embeddings) != len(atoms) or any(not vector for vector in embeddings):
                 raise ValueError(
                     f"Embedding shape mismatch: got {len(embeddings)} vectors, expected {len(atoms)}."
diff --git a/tests/unit/core/splitter/test_factory.py b/tests/unit/core/splitter/test_factory.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+import src.core.splitter.factory as factory
+from src.core.llm.interfaces import CapabilityType
+from src.core.splitter import StructuredSemanticChunker
+
+
+class _FakeEmbedder:  # stub client for this test; it only answers capability queries
+    def has_capability(self, capability):
+        return capability == CapabilityType.EMBEDDING  # True for EMBEDDING only
+
+
+def test_create_chunking_engine_should_pass_semantic_unit_from_settings(monkeypatch):
+    monkeypatch.setattr(factory.settings, "CHUNKING_ENABLE_ADVANCED_PIPELINE", True)
+    monkeypatch.setattr(factory.settings, "CHUNKING_SEMANTIC_UNIT", "paragraph")
+    monkeypatch.setattr(factory, "create_system_embedding_client", lambda: _FakeEmbedder())
+    # Settings and the embedding client are patched above, so the factory runs on stubs.
+    engine = factory.create_chunking_engine()
+    # The patched "paragraph" value must reach the nested semantic chunker unchanged.
+    assert isinstance(engine.chunker, StructuredSemanticChunker)
+    assert engine.chunker.semantic_chunker.semantic_unit == "paragraph"
diff --git a/tests/unit/core/splitter/test_semantic_chunker.py b/tests/unit/core/splitter/test_semantic_chunker.py
@@ -29,6 +29,16 @@ async def embed(self, texts, model=None, **kwargs):
         return MockEmbeddingResult(self._embeddings)
 
 
+class RecordingEmbedder(StaticEmbedder):  # StaticEmbedder that also logs embed() inputs
+    def __init__(self, embeddings):
+        super().__init__(embeddings)
+        self.calls = []  # one list of texts per embed() call, in call order
+
+    async def embed(self, texts, model=None, **kwargs):
+        self.calls.append(list(texts))  # store a snapshot copy of the requested texts
+        return await super().embed(texts, model=model, **kwargs)
+
+
 class FailingEmbedder:
     async def embed(self, texts, model=None, **kwargs):
         raise RuntimeError("mock embedding failure")
@@ -167,3 +177,77 @@ async def test_split_should_fallback_to_length_only_when_embedding_fails():
         "seven eight nine",
     ]
     assert chunker.last_stats.fallback_used is True
+
+
+async def test_split_should_use_paragraphs_as_semantic_units_when_configured():
+    tokenizer = MockWordTokenizer()
+    embedder = RecordingEmbedder(
+        [
+            [1.0, 0.0],
+            [1.0, 0.0],  # identical to the first vector
+            [0.0, 1.0],  # orthogonal to the first two vectors
+        ]
+    )
+    chunker = PercentileSemanticChunker(
+        embedder=embedder,
+        tokenizer=tokenizer,
+        percentile=50,
+        semantic_unit="paragraph",
+        min_chunk_tokens=1,
+        max_chunk_tokens=20,
+        overlap_tokens=0,
+        min_distance_gate=0.25,
+    )
+
+    first_paragraph = "alpha one. alpha two. alpha three. alpha four."  # four sentences
+    second_paragraph = "alpha five."
+    third_paragraph = "beta one."
+    text = "\n\n".join([first_paragraph, second_paragraph, third_paragraph])
+
+    chunks = await chunker.split(text)
+    # One embed() call with whole paragraphs; the first two merge, the third stands alone.
+    assert embedder.calls == [[first_paragraph, second_paragraph, third_paragraph]]
+    assert chunks == [f"{first_paragraph}\n\n{second_paragraph}", third_paragraph]
+    assert chunker.last_stats.atom_count == 3
+    assert chunker.last_stats.breakpoints == [1]
+
+
+async def test_split_should_length_split_oversized_paragraph_in_paragraph_mode():
+    tokenizer = MockWordTokenizer()
+    embedder = RecordingEmbedder(
+        [
+            [1.0, 0.0],
+            [1.0, 0.0],  # identical to the first vector
+        ]
+    )
+    chunker = PercentileSemanticChunker(
+        embedder=embedder,
+        tokenizer=tokenizer,
+        percentile=50,
+        semantic_unit="paragraph",
+        min_chunk_tokens=1,
+        max_chunk_tokens=4,  # smaller than the six-word paragraph below
+        overlap_tokens=0,
+        min_distance_gate=0.25,
+    )
+
+    long_paragraph = "p1 p2 p3 p4 p5 p6"  # six space-separated words
+    short_paragraph = "p7 p8"
+    chunks = await chunker.split(f"{long_paragraph}\n\n{short_paragraph}")
+    # The oversized paragraph is embedded whole, yet the output is still cut to the limit.
+    assert embedder.calls == [[long_paragraph, short_paragraph]]
+    assert chunks == ["p1 p2 p3 p4", "p5 p6", short_paragraph]
+    assert all(tokenizer.count_tokens(chunk) <= 4 for chunk in chunks)
+
+
+def test_splitter_should_reject_invalid_semantic_unit():
+    try:
+        PercentileSemanticChunker(
+            embedder=FailingEmbedder(),
+            tokenizer=MockWordTokenizer(),
+            semantic_unit="section",  # neither "sentence" nor "paragraph"
+        )
+    except ValueError as exc:
+        assert "semantic_unit must be one of" in str(exc)  # check the message prefix
+    else:
+        raise AssertionError("expected ValueError")  # construction must not succeed
diff --git a/tests/unit/test_config_sparse_vector.py b/tests/unit/test_config_sparse_vector.py
@@ -7,3 +7,18 @@ def test_should_enable_sparse_vector_by_default():
     settings = Settings(_env_file=None)
 
     assert settings.SPARSE_VECTOR_ENABLED is True
+
+
+def test_should_normalize_chunking_semantic_unit():
+    settings = Settings(_env_file=None, CHUNKING_SEMANTIC_UNIT=" Paragraph ")
+    # Padding and capitalization in the input are expected to be normalized away.
+    assert settings.CHUNKING_SEMANTIC_UNIT == "paragraph"
+
+
+def test_should_reject_invalid_chunking_semantic_unit():
+    try:
+        Settings(_env_file=None, CHUNKING_SEMANTIC_UNIT="section")  # unsupported value
+    except ValueError as exc:
+        assert "CHUNKING_SEMANTIC_UNIT must be 'sentence' or 'paragraph'" in str(exc)
+    else:
+        raise AssertionError("expected ValueError")  # construction must not succeed