Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ CHUNKING_SEMANTIC_PERCENTILE=95
CHUNKING_SEMANTIC_UNIT=sentence
CHUNKING_MIN_CHUNK_TOKENS=150
CHUNKING_MAX_CHUNK_TOKENS=512
# 是否启用相邻 chunk overlap;关闭后 CHUNKING_OVERLAP_TOKENS 不生效
CHUNKING_OVERLAP_ENABLED=true
# overlap token 数允许范围:0-64
CHUNKING_OVERLAP_TOKENS=64
CHUNKING_MIN_DISTANCE_GATE=0.25
CHUNKING_EMBED_BATCH_SIZE=32
Expand Down
18 changes: 15 additions & 3 deletions docs/internals/chunking.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ src/core/splitter/
├── chunking_engine.py # Markdown 解析与分片编排入口
├── rule_chunker.py # 基于 Markdown AST 的规则分片
├── semantic_chunker.py # 基于 embedding 距离的语义细分
├── overlap.py # chunk overlap 配置与上下文拼接
├── pipeline_chunker.py # 结构分片 + 语义细分两阶段分片器
└── embedding_pipeline.py # Chunk 向量化批处理管线
```
Expand Down Expand Up @@ -88,13 +89,24 @@ class BaseChunker(ABC):
- 先按 `semantic_unit` 配置把文本拆成语义比较原子;默认 `sentence` 保持原有段落、行、句子逐级降级行为,`paragraph` 则以段落作为相似度计算单位。
- 调用 embedding 模型计算相邻原子的语义距离。
- 使用距离分位数作为动态阈值寻找断点。
- 受 `min_chunk_tokens`、`max_chunk_tokens`、`overlap_tokens` 控制。
- 受 `min_chunk_tokens`、`max_chunk_tokens` 控制;overlap 改由独立配置(`CHUNKING_OVERLAP_ENABLED`、`CHUNKING_OVERLAP_TOKENS`)控制,但仍在原切分位置追加,切分算法流程保持不变。

`paragraph` 模式只改变相似度计算粒度:单个段落超过 `max_chunk_tokens` 时,不会再改用句子级 embedding 计算断点,但最终输出仍会做长度保底拆分,避免生成超长 Chunk。

它通常不直接作为主分片器使用,而是被 `StructuredSemanticChunker` 注入。

### 3.3 StructuredSemanticChunker
### 3.3 ChunkOverlapper

`ChunkOverlapper` 负责相邻 Chunk 的上下文 overlap,不参与语义断点计算。

配置:

- `CHUNKING_OVERLAP_ENABLED`:是否启用 overlap。
- `CHUNKING_OVERLAP_TOKENS`:启用后追加的 token 数上限,范围 `0..64`。

`CHUNKING_OVERLAP_ENABLED=false` 或 `CHUNKING_OVERLAP_TOKENS=0` 时,不追加 overlap。默认配置(`CHUNKING_OVERLAP_ENABLED=true`、`CHUNKING_OVERLAP_TOKENS=64`)保持现有分片行为。

### 3.4 StructuredSemanticChunker

`StructuredSemanticChunker` 是两阶段分片器:

Expand Down Expand Up @@ -201,7 +213,7 @@ chunks = engine.process(markdown)
修改语义分片时关注:

- token 上下限是否合理。
- overlap 是否造成内容膨胀
- overlap 是否按 `CHUNKING_OVERLAP_ENABLED` 与 `CHUNKING_OVERLAP_TOKENS` 生效,且没有造成内容膨胀。
- embedding 调用是否批量且可测试。
- 语义断点失败时是否有 fallback。

Expand Down
3 changes: 2 additions & 1 deletion docs/ops/configure.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@
| --- | --- | --- |
| `CHUNKING_MIN_CHUNK_TOKENS` | 150 | 短文档可减小 |
| `CHUNKING_MAX_CHUNK_TOKENS` | 512 | 长上下文模型可加大 |
| `CHUNKING_OVERLAP_TOKENS` | 64 | 提升召回时加大 |
| `CHUNKING_OVERLAP_ENABLED` | `true` | 是否启用相邻 chunk overlap |
| `CHUNKING_OVERLAP_TOKENS` | 64 | overlap token 数,范围 `0..64` |
| `CHUNKING_HEADING_BREAK_LEVEL` | 3 | 提升结构敏感性时减小 |
| `CHUNKING_SEMANTIC_PERCENTILE` | 95 | 调整语义边界严格度 |
| `CHUNKING_SEMANTIC_UNIT` | `sentence` | 语义相似度计算粒度:`sentence` / `paragraph` |
Expand Down
8 changes: 8 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def assemble_redis_url(cls, v: Optional[str], info) -> str:
CHUNKING_SEMANTIC_UNIT: str = "sentence"
CHUNKING_MIN_CHUNK_TOKENS: int = 150
CHUNKING_MAX_CHUNK_TOKENS: int = 512
CHUNKING_OVERLAP_ENABLED: bool = True
CHUNKING_OVERLAP_TOKENS: int = 64
CHUNKING_MIN_DISTANCE_GATE: float = 0.25
CHUNKING_EMBED_BATCH_SIZE: int = 32
Expand All @@ -124,6 +125,13 @@ def validate_chunking_semantic_unit(cls, v: str) -> str:
raise ValueError("CHUNKING_SEMANTIC_UNIT must be 'sentence' or 'paragraph'")
return normalized

@field_validator("CHUNKING_OVERLAP_TOKENS")
@classmethod
def validate_chunking_overlap_tokens(cls, v: int) -> int:
    """Validate that ``CHUNKING_OVERLAP_TOKENS`` lies in the inclusive range 0..64.

    Args:
        v: The configured overlap size in tokens.

    Returns:
        The value unchanged when it is within range.

    Raises:
        ValueError: If the value is negative or greater than 64.
    """
    out_of_range = v > 64 or v < 0
    if not out_of_range:
        return v
    raise ValueError("CHUNKING_OVERLAP_TOKENS must be between 0 and 64")

# ==========================================
# 向量数据库配置 (Vector Store)
# ==========================================
Expand Down
11 changes: 7 additions & 4 deletions src/core/splitter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,8 @@
Chunk — 分片数据模型
"""

from .models import Chunk, EmbeddedChunk, EmbeddingPipelineStats
from .base import BaseChunker
from .chunking_engine import ChunkingEngine
from .rule_chunker import ASTAwareChunker
from .pipeline_chunker import StructuredSemanticChunker
from .semantic_chunker import PercentileSemanticChunker, SemanticSplitter
from .embedding_pipeline import ChunkEmbeddingPipeline
from .factory import (
LazyEmbeddingClient,
Expand All @@ -24,6 +20,11 @@
create_lazy_system_embedding_client,
create_system_embedding_client,
)
from .models import Chunk, EmbeddedChunk, EmbeddingPipelineStats
from .overlap import ChunkOverlapConfig, ChunkOverlapper
from .pipeline_chunker import StructuredSemanticChunker
from .rule_chunker import ASTAwareChunker
from .semantic_chunker import PercentileSemanticChunker, SemanticSplitter

__all__ = [
"Chunk",
Expand All @@ -32,6 +33,8 @@
"BaseChunker",
"ChunkingEngine",
"ASTAwareChunker",
"ChunkOverlapConfig",
"ChunkOverlapper",
"StructuredSemanticChunker",
"PercentileSemanticChunker",
"SemanticSplitter",
Expand Down
1 change: 1 addition & 0 deletions src/core/splitter/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def create_chunking_engine() -> ChunkingEngine:
semantic_unit=settings.CHUNKING_SEMANTIC_UNIT,
min_chunk_tokens=settings.CHUNKING_MIN_CHUNK_TOKENS,
max_chunk_tokens=settings.CHUNKING_MAX_CHUNK_TOKENS,
overlap_enabled=settings.CHUNKING_OVERLAP_ENABLED,
overlap_tokens=settings.CHUNKING_OVERLAP_TOKENS,
min_distance_gate=settings.CHUNKING_MIN_DISTANCE_GATE,
)
Expand Down
136 changes: 136 additions & 0 deletions src/core/splitter/overlap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
"""Chunk overlap 配置与文本上下文处理工具。"""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
from src.core.llm.tokenizer import Tokenizer
else:
Tokenizer = Any


@dataclass(slots=True)
class ChunkOverlapConfig:
"""描述 chunk overlap 的独立配置。"""

enabled: bool = True
tokens: int = 64

def __post_init__(self) -> None:
if self.tokens < 0 or self.tokens > 64:
raise ValueError("overlap tokens must be between 0 and 64.")


class ChunkOverlapper:
    """Centralize token slicing and context stitching for chunk overlap.

    The tokenizer is used through two methods only: ``count_tokens(text)``
    and ``truncate_text(text, limit)``, the latter returning a tuple whose
    first item is the truncated text.
    """

    def __init__(
        self,
        tokenizer: Tokenizer,
        config: ChunkOverlapConfig | None = None,
    ) -> None:
        """Store the tokenizer and the overlap configuration.

        Args:
            tokenizer: Object used to count and truncate tokens.
            config: Overlap settings; a default ``ChunkOverlapConfig`` is
                created when omitted.
        """
        self.tokenizer = tokenizer
        # Explicit None check so a caller-supplied config is never replaced
        # just because it happens to evaluate as falsy.
        self.config = config if config is not None else ChunkOverlapConfig()

    @property
    def effective_tokens(self) -> int:
        """Return the overlap budget in effect (0 when overlap is disabled)."""
        if not self.config.enabled:
            return 0
        return self.config.tokens

    def count_tokens(self, text: str) -> int:
        """Count the tokens of ``text`` after stripping surrounding whitespace."""
        return self.tokenizer.count_tokens(text.strip()) if text else 0

    def take_first_tokens(self, text: str, token_limit: int) -> str:
        """Return the leading part of ``text`` truncated to ``token_limit`` tokens.

        Returns an empty string for empty input or a non-positive limit.
        """
        if not text or token_limit <= 0:
            return ""
        truncated, _ = self.tokenizer.truncate_text(text, token_limit)
        return truncated.strip()

    def take_last_tokens(self, text: str, token_limit: int) -> str:
        """Return the longest suffix of ``text`` found within ``token_limit``.

        Args:
            text: Source text; surrounding whitespace is ignored.
            token_limit: Maximum number of tokens the suffix may contain.

        Returns:
            The stripped text itself when it already fits, otherwise a suffix
            located by binary search over character offsets. An empty string
            is returned for empty input, a non-positive limit, or when no
            probed suffix fits the limit.
        """
        cleaned = text.strip()
        if not cleaned or token_limit <= 0:
            return ""
        if self.count_tokens(cleaned) <= token_limit:
            return cleaned

        low = 0
        high = len(cleaned) - 1
        best_start: int | None = None

        # NOTE(review): the search assumes the token count does not grow as
        # the suffix gets shorter -- confirm for the tokenizer in use.
        while low <= high:
            mid = (low + high) // 2
            if self.count_tokens(cleaned[mid:].lstrip()) <= token_limit:
                best_start = mid
                high = mid - 1
            else:
                low = mid + 1

        if best_start is None:
            # Even the shortest probed suffix is over the limit; returning it
            # anyway would break the limit the caller asked for.
            return ""
        return cleaned[best_start:].lstrip()

    def build_next_chunk(
        self,
        previous_chunk: str,
        next_atom: str,
        *,
        max_chunk_tokens: int,
    ) -> str:
        """Prefix ``next_atom`` with the tail of ``previous_chunk`` at a split.

        Args:
            previous_chunk: Text of the chunk that was just closed.
            next_atom: Text that starts the next chunk.
            max_chunk_tokens: Token ceiling for the combined text.

        Returns:
            ``next_atom`` unchanged when overlap is disabled, when there is no
            token room left, or when no tail fits; otherwise the tail and
            ``next_atom`` joined by a blank line.
        """
        overlap_budget = self.effective_tokens
        if overlap_budget <= 0:
            return next_atom

        available_for_overlap = max(0, max_chunk_tokens - self.count_tokens(next_atom))
        budget = min(overlap_budget, available_for_overlap)

        while budget > 0:
            overlap_tail = self.take_last_tokens(previous_chunk, budget)
            if not overlap_tail:
                return next_atom

            combined = f"{overlap_tail}\n\n{next_atom}".strip()
            # The joining blank line (and tokenization across the seam) can
            # add tokens that the budget above did not account for, so the
            # combined text is measured again and the tail shrunk if needed.
            overflow = self.count_tokens(combined) - max_chunk_tokens
            if overflow <= 0:
                return combined
            budget = min(budget, self.count_tokens(overlap_tail)) - overflow

        return next_atom

    def build_neighbor_context(
        self,
        *,
        previous_content: str | None,
        current_content: str,
        next_content: str | None,
    ) -> tuple[str, int, int]:
        """Surround a final chunk with context taken from its neighbours.

        Args:
            previous_content: Text of the preceding chunk, if any.
            current_content: Text of the chunk being extended.
            next_content: Text of the following chunk, if any.

        Returns:
            A tuple ``(content, previous_tokens, next_tokens)`` where the two
            counters are the token counts of the context actually prepended
            and appended. When overlap is disabled, ``current_content`` is
            returned untouched with both counters set to 0.
        """
        overlap_budget = self.effective_tokens
        if overlap_budget <= 0:
            return current_content, 0, 0

        contextual_parts: list[str] = []
        previous_tokens = 0
        next_tokens = 0

        if previous_content:
            previous_context = self.take_last_tokens(previous_content, overlap_budget)
            if previous_context:
                previous_tokens = self.count_tokens(previous_context)
                contextual_parts.append(previous_context)

        contextual_parts.append(current_content)

        if next_content:
            next_context = self.take_first_tokens(next_content, overlap_budget)
            if next_context:
                next_tokens = self.count_tokens(next_context)
                contextual_parts.append(next_context)

        return "\n\n".join(contextual_parts).strip(), previous_tokens, next_tokens
33 changes: 7 additions & 26 deletions src/core/splitter/pipeline_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,38 +151,19 @@ def _apply_neighbor_context(self, chunks: list[Chunk]) -> list[Chunk]:
Returns:
list[Chunk]: 追加邻接上下文后的 Chunk 列表。
"""
overlap_budget = self.semantic_chunker._resolve_overlap_tokens()
if overlap_budget <= 0 or len(chunks) <= 1:
if self.semantic_chunker.overlapper.effective_tokens <= 0 or len(chunks) <= 1:
return chunks

base_contents = [chunk.content for chunk in chunks]

for index, chunk in enumerate(chunks):
contextual_parts: list[str] = []
previous_tokens = 0
next_tokens = 0

if index > 0:
previous_context = self.semantic_chunker._take_last_tokens(
base_contents[index - 1],
overlap_budget,
chunk.content, previous_tokens, next_tokens = (
self.semantic_chunker.overlapper.build_neighbor_context(
previous_content=base_contents[index - 1] if index > 0 else None,
current_content=base_contents[index],
next_content=base_contents[index + 1] if index + 1 < len(chunks) else None,
)
if previous_context:
previous_tokens = self.semantic_chunker.tokenizer.count_tokens(previous_context)
contextual_parts.append(previous_context)

contextual_parts.append(base_contents[index])

if index + 1 < len(chunks):
next_context = self.semantic_chunker._take_first_tokens(
base_contents[index + 1],
overlap_budget,
)
if next_context:
next_tokens = self.semantic_chunker.tokenizer.count_tokens(next_context)
contextual_parts.append(next_context)

chunk.content = "\n\n".join(contextual_parts).strip()
)
if previous_tokens > 0:
chunk.metadata["context_prev_tokens_applied"] = previous_tokens
if next_tokens > 0:
Expand Down
Loading
Loading