Skip to content

Commit bdfd540

Browse files
Yuning Bianjixua
authored and committed
feat(splitter): 支持语义分片粒度配置 (#133)
新增 CHUNKING_SEMANTIC_UNIT 配置,支持 sentence / paragraph。 段落模式下按段落计算相似度,超长段落仍做长度保底拆分。 补充分片、配置校验和工厂接线测试,并同步分块配置文档。 删除 .claude/skills 链接。
1 parent 5a906ef commit bdfd540

10 files changed

Lines changed: 192 additions & 11 deletions

File tree

.claude/skills

Lines changed: 0 additions & 1 deletion
This file was deleted.

.env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ MARKDOWN_PARSER_VISION_CONCURRENCY=24
5050
CHUNKING_ENABLE_ADVANCED_PIPELINE=true
5151
CHUNKING_HEADING_BREAK_LEVEL=3
5252
CHUNKING_SEMANTIC_PERCENTILE=95
53+
CHUNKING_SEMANTIC_UNIT=sentence
5354
CHUNKING_MIN_CHUNK_TOKENS=150
5455
CHUNKING_MAX_CHUNK_TOKENS=512
5556
CHUNKING_OVERLAP_TOKENS=64

docs/internals/chunking.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,13 @@ class BaseChunker(ABC):
8585

8686
核心思路:
8787

88-
- 先把文本拆成段落、行或句子级原子单元
88+
- 先按 `semantic_unit` 配置把文本拆成语义比较原子;默认 `sentence` 保持原有段落、行、句子逐级降级行为,`paragraph` 则以段落作为相似度计算单位
8989
- 调用 embedding 模型计算相邻原子的语义距离。
9090
- 使用距离分位数作为动态阈值寻找断点。
9191
- 由 `min_chunk_tokens`、`max_chunk_tokens`、`overlap_tokens` 控制。
9292

93+
`paragraph` 模式只改变相似度计算粒度:单个段落超过 `max_chunk_tokens` 时,不会再改用句子级 embedding 计算断点,但最终输出仍会做长度保底拆分,避免生成超长 Chunk。
94+
9395
它通常不直接作为主分片器使用,而是被 `StructuredSemanticChunker` 注入。
9496

9597
### 3.3 StructuredSemanticChunker

docs/ops/configure.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@
8989
| `CHUNKING_OVERLAP_TOKENS` | 64 | 提升召回时加大 |
9090
| `CHUNKING_HEADING_BREAK_LEVEL` | 3 | 提升结构敏感性时减小 |
9191
| `CHUNKING_SEMANTIC_PERCENTILE` | 95 | 调整语义边界严格度 |
92+
| `CHUNKING_SEMANTIC_UNIT` | `sentence` | 语义相似度计算粒度:`sentence` / `paragraph` |
9293
| `CHUNKING_EMBED_BATCH_SIZE` | 32 | 受向量服务并发上限约束 |
9394

9495
详细分块策略见 [chunking.md](../internals/chunking.md)

src/config.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,21 @@ def assemble_redis_url(cls, v: Optional[str], info) -> str:
109109
CHUNKING_ENABLE_ADVANCED_PIPELINE: bool = True
110110
CHUNKING_HEADING_BREAK_LEVEL: int = 3
111111
CHUNKING_SEMANTIC_PERCENTILE: float = 95.0
112+
CHUNKING_SEMANTIC_UNIT: str = "sentence"
112113
CHUNKING_MIN_CHUNK_TOKENS: int = 150
113114
CHUNKING_MAX_CHUNK_TOKENS: int = 512
114115
CHUNKING_OVERLAP_TOKENS: int = 64
115116
CHUNKING_MIN_DISTANCE_GATE: float = 0.25
116117
CHUNKING_EMBED_BATCH_SIZE: int = 32
117118

119+
@field_validator("CHUNKING_SEMANTIC_UNIT")
@classmethod
def validate_chunking_semantic_unit(cls, v: str) -> str:
    """Normalize and validate the semantic chunking granularity.

    Args:
        v: Raw configured value.

    Returns:
        The value with surrounding whitespace removed and lower-cased.

    Raises:
        ValueError: If the normalized value is neither ``sentence`` nor
            ``paragraph``.
    """
    unit = v.strip().lower()
    if unit in {"sentence", "paragraph"}:
        return unit
    raise ValueError("CHUNKING_SEMANTIC_UNIT must be 'sentence' or 'paragraph'")
126+
118127
# ==========================================
119128
# 向量数据库配置 (Vector Store)
120129
# ==========================================

src/core/splitter/factory.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from src.config import settings
1414
from src.core.llm.factory import ModelFactory
15-
from src.core.llm.interfaces import CapabilityType
15+
from src.core.llm.interfaces import CapabilityType, IEmbedder
1616
from src.core.llm.tokenizer import Tokenizer
1717

1818
from .chunking_engine import ChunkingEngine
@@ -63,7 +63,7 @@ def __init__(
6363
)
6464

6565

66-
class LazyEmbeddingClient:
66+
class LazyEmbeddingClient(IEmbedder):
6767
"""延迟初始化的 Embedding 客户端包装器。
6868
6969
Chunk 索引并非主链路 ACK 的前置条件。延迟创建 Embedding 客户端可以避免
@@ -130,6 +130,7 @@ def create_chunking_engine() -> ChunkingEngine:
130130
embedder=embedder,
131131
tokenizer=Tokenizer(),
132132
percentile=settings.CHUNKING_SEMANTIC_PERCENTILE,
133+
semantic_unit=settings.CHUNKING_SEMANTIC_UNIT,
133134
min_chunk_tokens=settings.CHUNKING_MIN_CHUNK_TOKENS,
134135
max_chunk_tokens=settings.CHUNKING_MAX_CHUNK_TOKENS,
135136
overlap_tokens=settings.CHUNKING_OVERLAP_TOKENS,

src/core/splitter/semantic_chunker.py

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class PercentileSemanticChunker:
5151
"""
5252

5353
SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[。!?;])|(?<=[.?!;])\s+")
54+
SUPPORTED_SEMANTIC_UNITS = frozenset({"sentence", "paragraph"})
5455

5556
def __init__(
5657
self,
@@ -62,6 +63,7 @@ def __init__(
6263
overlap_tokens: int = 50,
6364
overlap_percentage: float | None = None,
6465
min_distance_gate: float = 0.25,
66+
semantic_unit: str = "sentence",
6567
):
6668
"""
6769
初始化语义切片器及其阈值、长度约束与 overlap 配置。
@@ -75,10 +77,12 @@ def __init__(
7577
overlap_tokens: 相邻 Chunk 的固定 token overlap 上限。
7678
overlap_percentage: 可选的 overlap 百分比配置;当 `overlap_tokens` 为 0 时启用。
7779
min_distance_gate: 绝对最小语义距离阈值,用于避免过度切分。
80+
semantic_unit: 语义相似度计算粒度,支持 `sentence` 或 `paragraph`。
7881
7982
Returns:
8083
None.
8184
"""
85+
semantic_unit = semantic_unit.strip().lower()
8286
if not 0 < percentile <= 100:
8387
raise ValueError("percentile must be in (0, 100].")
8488
if min_chunk_tokens <= 0:
@@ -93,6 +97,9 @@ def __init__(
9397
raise ValueError("overlap_percentage must be in [0, 1).")
9498
if min_distance_gate < 0:
9599
raise ValueError("min_distance_gate cannot be negative.")
100+
if semantic_unit not in self.SUPPORTED_SEMANTIC_UNITS:
101+
supported = ", ".join(sorted(self.SUPPORTED_SEMANTIC_UNITS))
102+
raise ValueError(f"semantic_unit must be one of: {supported}.")
96103

97104
self.embedder = embedder
98105
self.tokenizer = tokenizer
@@ -102,6 +109,7 @@ def __init__(
102109
self.overlap_tokens = overlap_tokens
103110
self.overlap_percentage = overlap_percentage
104111
self.min_distance_gate = min_distance_gate
112+
self.semantic_unit = semantic_unit
105113
self.last_stats = SemanticChunkingStats()
106114

107115
def _resolve_overlap_tokens(self) -> int:
@@ -243,6 +251,18 @@ def _split_by_sentences(self, text: str) -> List[str]:
243251
atoms.append(current)
244252
return atoms
245253

254+
def _split_paragraphs(self, text: str) -> List[str]:
255+
"""
256+
按 Markdown 段落边界切分文本,过滤空段落并保留段落内部换行。
257+
258+
Args:
259+
text: 待切分的大文本块。
260+
261+
Returns:
262+
List[str]: 非空段落列表。
263+
"""
264+
return [paragraph.strip() for paragraph in re.split(r"\n{2,}", text) if paragraph.strip()]
265+
246266
def _atomize_text(self, text: str) -> List[str]:
247267
"""
248268
执行原子化拆解,优先按段落切分,必要时降级为按行或按句切分。
@@ -253,7 +273,10 @@ def _atomize_text(self, text: str) -> List[str]:
253273
Returns:
254274
List[str]: 原子化后的文本单元列表。
255275
"""
256-
paragraphs = [paragraph.strip() for paragraph in re.split(r"\n{2,}", text) if paragraph.strip()]
276+
paragraphs = self._split_paragraphs(text)
277+
if self.semantic_unit == "paragraph":
278+
return paragraphs
279+
257280
atoms: List[str] = []
258281

259282
for paragraph in paragraphs:
@@ -274,6 +297,25 @@ def _atomize_text(self, text: str) -> List[str]:
274297

275298
return [atom for atom in atoms if atom.strip()]
276299

300+
def _append_limited_chunk(self, chunks: list[str], text: str) -> None:
301+
"""
302+
追加最终 Chunk 文本;若文本超长,则只做长度保底拆分。
303+
304+
Args:
305+
chunks: 待追加的最终 Chunk 列表。
306+
text: 当前待落盘的 Chunk 文本。
307+
308+
Returns:
309+
None.
310+
"""
311+
cleaned = text.strip()
312+
if not cleaned:
313+
return
314+
if self._count_tokens(cleaned) <= self.max_chunk_tokens:
315+
chunks.append(cleaned)
316+
return
317+
chunks.extend(self._split_oversized_text(cleaned))
318+
277319
@staticmethod
278320
def _compute_distances(embeddings: Sequence[Sequence[float]]) -> list[float]:
279321
"""
@@ -391,7 +433,9 @@ def _group_atom_indices(
391433

392434
for idx in range(1, len(atoms)):
393435
next_atom = atoms[idx].strip()
394-
distance = distances[idx - 1] if distances is not None and idx - 1 < len(distances) else None
436+
distance = (
437+
distances[idx - 1] if distances is not None and idx - 1 < len(distances) else None
438+
)
395439

396440
semantic_breakpoint = (
397441
distance is not None
@@ -454,7 +498,9 @@ def _merge_atoms(
454498

455499
for idx in range(1, len(atoms)):
456500
next_atom = atoms[idx].strip()
457-
distance = distances[idx - 1] if distances is not None and idx - 1 < len(distances) else None
501+
distance = (
502+
distances[idx - 1] if distances is not None and idx - 1 < len(distances) else None
503+
)
458504

459505
semantic_breakpoint = (
460506
distance is not None
@@ -469,15 +515,15 @@ def _merge_atoms(
469515
if overflow_forced or (
470516
semantic_breakpoint and self._count_tokens(current_text) >= self.min_chunk_tokens
471517
):
472-
chunks.append(current_text)
518+
self._append_limited_chunk(chunks, current_text)
473519
if semantic_breakpoint and not overflow_forced:
474520
breakpoints.append(idx - 1)
475521
current_text = self._build_next_chunk(current_text, next_atom)
476522
else:
477523
current_text = merged_candidate
478524

479525
if current_text:
480-
chunks.append(current_text)
526+
self._append_limited_chunk(chunks, current_text)
481527

482528
self.last_stats = SemanticChunkingStats(
483529
atom_count=len(atoms),
@@ -509,7 +555,8 @@ async def group_texts(self, texts: Sequence[str]) -> list[list[int]]:
509555

510556
try:
511557
embedding_result = await self.embedder.embed(list(atoms))
512-
embeddings = [list(map(float, vector)) for vector in embedding_result.embeddings]
558+
raw_embeddings = getattr(embedding_result, "embeddings", None) or []
559+
embeddings = [list(map(float, vector)) for vector in raw_embeddings]
513560
if len(embeddings) != len(atoms) or any(not vector for vector in embeddings):
514561
raise ValueError(
515562
f"Embedding shape mismatch: got {len(embeddings)} vectors, expected {len(atoms)}."
@@ -547,7 +594,8 @@ async def split(self, text_block: str) -> List[str]:
547594

548595
try:
549596
embedding_result = await self.embedder.embed(list(atoms))
550-
embeddings = [list(map(float, vector)) for vector in embedding_result.embeddings]
597+
raw_embeddings = getattr(embedding_result, "embeddings", None) or []
598+
embeddings = [list(map(float, vector)) for vector in raw_embeddings]
551599
if len(embeddings) != len(atoms) or any(not vector for vector in embeddings):
552600
raise ValueError(
553601
f"Embedding shape mismatch: got {len(embeddings)} vectors, expected {len(atoms)}."
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from __future__ import annotations
2+
3+
import src.core.splitter.factory as factory
4+
from src.core.llm.interfaces import CapabilityType
5+
from src.core.splitter import StructuredSemanticChunker
6+
7+
8+
class _FakeEmbedder:
    """Embedder stand-in that reports only the embedding capability."""

    def has_capability(self, capability):
        """Return True only when asked about ``CapabilityType.EMBEDDING``."""
        is_embedding = capability == CapabilityType.EMBEDDING
        return is_embedding
11+
12+
13+
def test_create_chunking_engine_should_pass_semantic_unit_from_settings(monkeypatch):
    """The factory forwards CHUNKING_SEMANTIC_UNIT to the semantic chunker."""
    overrides = {
        "CHUNKING_ENABLE_ADVANCED_PIPELINE": True,
        "CHUNKING_SEMANTIC_UNIT": "paragraph",
    }
    for name, value in overrides.items():
        monkeypatch.setattr(factory.settings, name, value)
    # Swap the embedding client constructor for the in-test fake.
    monkeypatch.setattr(factory, "create_system_embedding_client", lambda: _FakeEmbedder())

    chunker = factory.create_chunking_engine().chunker

    assert isinstance(chunker, StructuredSemanticChunker)
    assert chunker.semantic_chunker.semantic_unit == "paragraph"

tests/unit/core/splitter/test_semantic_chunker.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,16 @@ async def embed(self, texts, model=None, **kwargs):
2929
return MockEmbeddingResult(self._embeddings)
3030

3131

32+
class RecordingEmbedder(StaticEmbedder):
    """StaticEmbedder variant that records the texts of every embed() call."""

    def __init__(self, embeddings):
        super().__init__(embeddings)
        # One entry per embed() call: a list copy of the texts received.
        self.calls = []

    async def embed(self, texts, model=None, **kwargs):
        """Record ``texts`` and delegate to the parent implementation."""
        received = list(texts)
        self.calls.append(received)
        result = await super().embed(texts, model=model, **kwargs)
        return result
40+
41+
3242
class FailingEmbedder:
3343
async def embed(self, texts, model=None, **kwargs):
3444
raise RuntimeError("mock embedding failure")
@@ -167,3 +177,77 @@ async def test_split_should_fallback_to_length_only_when_embedding_fails():
167177
"seven eight nine",
168178
]
169179
assert chunker.last_stats.fallback_used is True
180+
181+
182+
async def test_split_should_use_paragraphs_as_semantic_units_when_configured():
    """Paragraph mode embeds whole paragraphs and breaks where they diverge."""
    paragraphs = [
        "alpha one. alpha two. alpha three. alpha four.",
        "alpha five.",
        "beta one.",
    ]
    # One vector per paragraph: the first two are identical, the third is orthogonal.
    recorder = RecordingEmbedder([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
    chunker = PercentileSemanticChunker(
        embedder=recorder,
        tokenizer=MockWordTokenizer(),
        percentile=50,
        semantic_unit="paragraph",
        min_chunk_tokens=1,
        max_chunk_tokens=20,
        overlap_tokens=0,
        min_distance_gate=0.25,
    )

    chunks = await chunker.split("\n\n".join(paragraphs))

    # A single embed call that carries the paragraphs unchanged.
    assert recorder.calls == [paragraphs]
    assert chunks == ["\n\n".join(paragraphs[:2]), paragraphs[2]]
    assert chunker.last_stats.atom_count == 3
    assert chunker.last_stats.breakpoints == [1]
213+
214+
215+
async def test_split_should_length_split_oversized_paragraph_in_paragraph_mode():
    """An over-long paragraph is still cut down to the token cap in paragraph mode."""
    token_cap = 4
    word_tokenizer = MockWordTokenizer()
    recorder = RecordingEmbedder([[1.0, 0.0], [1.0, 0.0]])
    chunker = PercentileSemanticChunker(
        embedder=recorder,
        tokenizer=word_tokenizer,
        percentile=50,
        semantic_unit="paragraph",
        min_chunk_tokens=1,
        max_chunk_tokens=token_cap,
        overlap_tokens=0,
        min_distance_gate=0.25,
    )
    oversized = "p1 p2 p3 p4 p5 p6"
    trailing = "p7 p8"

    chunks = await chunker.split("\n\n".join((oversized, trailing)))

    # Embedding still sees the whole paragraphs; only the output is length-split.
    assert recorder.calls == [[oversized, trailing]]
    assert chunks == ["p1 p2 p3 p4", "p5 p6", trailing]
    for chunk in chunks:
        assert word_tokenizer.count_tokens(chunk) <= token_cap
241+
242+
243+
def test_splitter_should_reject_invalid_semantic_unit():
    """Constructing the chunker with an unsupported unit raises ValueError."""
    caught = None
    try:
        PercentileSemanticChunker(
            embedder=FailingEmbedder(),
            tokenizer=MockWordTokenizer(),
            semantic_unit="section",
        )
    except ValueError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("expected ValueError")
    assert "semantic_unit must be one of" in str(caught)

tests/unit/test_config_sparse_vector.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,18 @@ def test_should_enable_sparse_vector_by_default():
77
settings = Settings(_env_file=None)
88

99
assert settings.SPARSE_VECTOR_ENABLED is True
10+
11+
12+
def test_should_normalize_chunking_semantic_unit():
    """Surrounding whitespace and letter case are normalized away."""
    raw_value = " Paragraph "
    configured = Settings(_env_file=None, CHUNKING_SEMANTIC_UNIT=raw_value)

    assert configured.CHUNKING_SEMANTIC_UNIT == "paragraph"
16+
17+
18+
def test_should_reject_invalid_chunking_semantic_unit():
    """An unsupported CHUNKING_SEMANTIC_UNIT value fails settings validation."""
    caught = None
    try:
        Settings(_env_file=None, CHUNKING_SEMANTIC_UNIT="section")
    except ValueError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("expected ValueError")
    assert "CHUNKING_SEMANTIC_UNIT must be 'sentence' or 'paragraph'" in str(caught)

0 commit comments

Comments
 (0)