diff --git a/.env.example b/.env.example index f1b8102..0ff38c0 100644 --- a/.env.example +++ b/.env.example @@ -53,6 +53,9 @@ CHUNKING_SEMANTIC_PERCENTILE=95 CHUNKING_SEMANTIC_UNIT=sentence CHUNKING_MIN_CHUNK_TOKENS=150 CHUNKING_MAX_CHUNK_TOKENS=512 +# 是否启用相邻 chunk overlap;关闭后 CHUNKING_OVERLAP_TOKENS 不生效 +CHUNKING_OVERLAP_ENABLED=true +# overlap token 数允许范围:0-64 CHUNKING_OVERLAP_TOKENS=64 CHUNKING_MIN_DISTANCE_GATE=0.25 CHUNKING_EMBED_BATCH_SIZE=32 diff --git a/docs/internals/chunking.md b/docs/internals/chunking.md index b2c313e..af01836 100644 --- a/docs/internals/chunking.md +++ b/docs/internals/chunking.md @@ -11,6 +11,7 @@ src/core/splitter/ ├── chunking_engine.py # Markdown 解析与分片编排入口 ├── rule_chunker.py # 基于 Markdown AST 的规则分片 ├── semantic_chunker.py # 基于 embedding 距离的语义细分 +├── overlap.py # chunk overlap 配置与上下文拼接 ├── pipeline_chunker.py # 结构分片 + 语义细分两阶段分片器 └── embedding_pipeline.py # Chunk 向量化批处理管线 ``` @@ -88,13 +89,24 @@ class BaseChunker(ABC): - 先按 `semantic_unit` 配置把文本拆成语义比较原子;默认 `sentence` 保持原有段落、行、句子逐级降级行为,`paragraph` 则以段落作为相似度计算单位。 - 调用 embedding 模型计算相邻原子的语义距离。 - 使用距离分位数作为动态阈值寻找断点。 -- 受 `min_chunk_tokens`、`max_chunk_tokens`、`overlap_tokens` 控制。 +- 受 `min_chunk_tokens`、`max_chunk_tokens` 控制;overlap 由独立配置控制,但仍在原切分位置追加,保证算法流程不变。 `paragraph` 模式只改变相似度计算粒度:单个段落超过 `max_chunk_tokens` 时,不会再改用句子级 embedding 计算断点,但最终输出仍会做长度保底拆分,避免生成超长 Chunk。 它通常不直接作为主分片器使用,而是被 `StructuredSemanticChunker` 注入。 -### 3.3 StructuredSemanticChunker +### 3.3 ChunkOverlapper + +`ChunkOverlapper` 负责相邻 Chunk 的上下文 overlap,不参与语义断点计算。 + +配置: + +- `CHUNKING_OVERLAP_ENABLED`:是否启用 overlap。 +- `CHUNKING_OVERLAP_TOKENS`:启用后追加的 token 数上限,范围 `0..64`。 + +`CHUNKING_OVERLAP_ENABLED=false` 或 `CHUNKING_OVERLAP_TOKENS=0` 时,不追加 overlap。默认 `true + 64` 保持现有分片行为。 + +### 3.4 StructuredSemanticChunker `StructuredSemanticChunker` 是两阶段分片器: @@ -201,7 +213,7 @@ chunks = engine.process(markdown) 修改语义分片时关注: - token 上下限是否合理。 -- overlap 是否造成内容膨胀。 +- overlap 是否按 `CHUNKING_OVERLAP_ENABLED` 与 `CHUNKING_OVERLAP_TOKENS` 生效,且没有造成内容膨胀。 - embedding 调用是否批量且可测试。 - 语义断点失败时是否有 fallback。 diff --git a/docs/ops/configure.md b/docs/ops/configure.md index ca8f21e..a37a66e 100644 --- a/docs/ops/configure.md +++ b/docs/ops/configure.md @@ -86,7 +86,8 @@ | --- | --- | --- | | `CHUNKING_MIN_CHUNK_TOKENS` | 150 | 短文档可减小 | | `CHUNKING_MAX_CHUNK_TOKENS` | 512 | 长上下文模型可加大 | -| `CHUNKING_OVERLAP_TOKENS` | 64 | 提升召回时加大 | +| `CHUNKING_OVERLAP_ENABLED` | `true` | 是否启用相邻 chunk overlap | +| `CHUNKING_OVERLAP_TOKENS` | 64 | overlap token 数,范围 `0..64` | | `CHUNKING_HEADING_BREAK_LEVEL` | 3 | 提升结构敏感性时减小 | | `CHUNKING_SEMANTIC_PERCENTILE` | 95 | 调整语义边界严格度 | | `CHUNKING_SEMANTIC_UNIT` | `sentence` | 语义相似度计算粒度:`sentence` / `paragraph` | diff --git a/src/config.py b/src/config.py index d4a48a0..199e5b2 100644 --- a/src/config.py +++ b/src/config.py @@ -112,6 +112,7 @@ def assemble_redis_url(cls, v: Optional[str], info) -> str: CHUNKING_SEMANTIC_UNIT: str = "sentence" CHUNKING_MIN_CHUNK_TOKENS: int = 150 CHUNKING_MAX_CHUNK_TOKENS: int = 512 + CHUNKING_OVERLAP_ENABLED: bool = True CHUNKING_OVERLAP_TOKENS: int = 64 CHUNKING_MIN_DISTANCE_GATE: float = 0.25 CHUNKING_EMBED_BATCH_SIZE: int = 32 @@ -124,6 +125,13 @@ def validate_chunking_semantic_unit(cls, v: str) -> str: raise ValueError("CHUNKING_SEMANTIC_UNIT must be 'sentence' or 'paragraph'") return normalized + @field_validator("CHUNKING_OVERLAP_TOKENS") + @classmethod + def validate_chunking_overlap_tokens(cls, v: int) -> int: + if v < 0 or v > 64: + raise ValueError("CHUNKING_OVERLAP_TOKENS must be between 0 and 64") + return v + # ========================================== # 向量数据库配置 (Vector Store) # ========================================== diff --git a/src/core/splitter/__init__.py b/src/core/splitter/__init__.py index 9a0f4c1..7464446 100644 --- a/src/core/splitter/__init__.py +++ b/src/core/splitter/__init__.py @@ -10,12 +10,8 @@ Chunk — 分片数据模型 """ -from .models import Chunk, EmbeddedChunk, EmbeddingPipelineStats from .base import BaseChunker from .chunking_engine import ChunkingEngine -from .rule_chunker import ASTAwareChunker -from .pipeline_chunker import StructuredSemanticChunker -from .semantic_chunker import PercentileSemanticChunker, SemanticSplitter from .embedding_pipeline import ChunkEmbeddingPipeline from .factory import ( LazyEmbeddingClient, @@ -24,6 +20,11 @@ create_lazy_system_embedding_client, create_system_embedding_client, ) +from .models import Chunk, EmbeddedChunk, EmbeddingPipelineStats +from .overlap import ChunkOverlapConfig, ChunkOverlapper +from .pipeline_chunker import StructuredSemanticChunker +from .rule_chunker import ASTAwareChunker +from .semantic_chunker import PercentileSemanticChunker, SemanticSplitter __all__ = [ "Chunk", @@ -32,6 +33,8 @@ "BaseChunker", "ChunkingEngine", "ASTAwareChunker", + "ChunkOverlapConfig", + "ChunkOverlapper", "StructuredSemanticChunker", "PercentileSemanticChunker", "SemanticSplitter", diff --git a/src/core/splitter/factory.py b/src/core/splitter/factory.py index 61c5687..66e4b22 100644 --- a/src/core/splitter/factory.py +++ b/src/core/splitter/factory.py @@ -133,6 +133,7 @@ def create_chunking_engine() -> ChunkingEngine: semantic_unit=settings.CHUNKING_SEMANTIC_UNIT, min_chunk_tokens=settings.CHUNKING_MIN_CHUNK_TOKENS, max_chunk_tokens=settings.CHUNKING_MAX_CHUNK_TOKENS, + overlap_enabled=settings.CHUNKING_OVERLAP_ENABLED, overlap_tokens=settings.CHUNKING_OVERLAP_TOKENS, min_distance_gate=settings.CHUNKING_MIN_DISTANCE_GATE, ) diff --git a/src/core/splitter/overlap.py b/src/core/splitter/overlap.py new file mode 100644 index 0000000..335a4ae --- /dev/null +++ b/src/core/splitter/overlap.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- +"""Chunk overlap 配置与文本上下文处理工具。""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from src.core.llm.tokenizer import Tokenizer +else: + Tokenizer = Any + + +@dataclass(slots=True) +class ChunkOverlapConfig: + """描述 chunk overlap 的独立配置。""" + + enabled: bool = True + tokens: int = 64 + + def __post_init__(self) -> None: + if self.tokens < 0 or self.tokens > 64: + raise ValueError("overlap tokens must be between 0 and 64.") + + +class ChunkOverlapper: + """集中处理 chunk overlap 的 token 截取与上下文拼接。""" + + def __init__( + self, + tokenizer: Tokenizer, + config: ChunkOverlapConfig | None = None, + ) -> None: + self.tokenizer = tokenizer + self.config = config or ChunkOverlapConfig() + + @property + def effective_tokens(self) -> int: + """返回当前实际启用的 overlap token 数。""" + if not self.config.enabled: + return 0 + return self.config.tokens + + def count_tokens(self, text: str) -> int: + """统计文本 token 数。""" + return self.tokenizer.count_tokens(text.strip()) if text else 0 + + def take_first_tokens(self, text: str, token_limit: int) -> str: + """取出文本开头的指定数量 token。""" + if not text or token_limit <= 0: + return "" + truncated, _ = self.tokenizer.truncate_text(text, token_limit) + return truncated.strip() + + def take_last_tokens(self, text: str, token_limit: int) -> str: + """取出文本末尾的指定数量 token。""" + cleaned = text.strip() + if not cleaned or token_limit <= 0: + return "" + if self.count_tokens(cleaned) <= token_limit: + return cleaned + + left = 0 + right = len(cleaned) - 1 + best_start = right + + while left <= right: + mid = (left + right) // 2 + candidate = cleaned[mid:].lstrip() + tokens = self.count_tokens(candidate) + if tokens <= token_limit: + best_start = mid + right = mid - 1 + else: + left = mid + 1 + + return cleaned[best_start:].lstrip() + + def build_next_chunk( + self, + previous_chunk: str, + next_atom: str, + *, + max_chunk_tokens: int, + ) -> str: + """在切分发生时,为下一块追加上一块尾部 overlap。""" + overlap_budget = self.effective_tokens + if overlap_budget <= 0: + return next_atom + + next_tokens = self.count_tokens(next_atom) + available_for_overlap = max(0, max_chunk_tokens - next_tokens) + if available_for_overlap <= 0: + return next_atom + + overlap_tail = self.take_last_tokens( + previous_chunk, + min(overlap_budget, available_for_overlap), + ) + if not overlap_tail: + return next_atom + + return f"{overlap_tail}\n\n{next_atom}".strip() + + def build_neighbor_context( + self, + *, + previous_content: str | None, + current_content: str, + next_content: str | None, + ) -> tuple[str, int, int]: + """为最终 chunk 构造相邻上下文,并返回实际追加的前后 token 数。""" + overlap_budget = self.effective_tokens + if overlap_budget <= 0: + return current_content, 0, 0 + + contextual_parts: list[str] = [] + previous_tokens = 0 + next_tokens = 0 + + if previous_content: + previous_context = self.take_last_tokens(previous_content, overlap_budget) + if previous_context: + previous_tokens = self.count_tokens(previous_context) + contextual_parts.append(previous_context) + + contextual_parts.append(current_content) + + if next_content: + next_context = self.take_first_tokens(next_content, overlap_budget) + if next_context: + next_tokens = self.count_tokens(next_context) + contextual_parts.append(next_context) + + return "\n\n".join(contextual_parts).strip(), previous_tokens, next_tokens diff --git a/src/core/splitter/pipeline_chunker.py b/src/core/splitter/pipeline_chunker.py index 6cae0f0..5423db0 100644 --- a/src/core/splitter/pipeline_chunker.py +++ b/src/core/splitter/pipeline_chunker.py @@ -151,38 +151,19 @@ def _apply_neighbor_context(self, chunks: list[Chunk]) -> list[Chunk]: Returns: list[Chunk]: 追加邻接上下文后的 Chunk 列表。 """ - overlap_budget = self.semantic_chunker._resolve_overlap_tokens() - if overlap_budget <= 0 or len(chunks) <= 1: + if self.semantic_chunker.overlapper.effective_tokens <= 0 or len(chunks) <= 1: return chunks base_contents = [chunk.content for chunk in chunks] for index, chunk in enumerate(chunks): - contextual_parts: list[str] = [] - previous_tokens = 0 - next_tokens = 0 - - if index > 0: - previous_context = self.semantic_chunker._take_last_tokens( - base_contents[index - 1], - overlap_budget, + chunk.content, previous_tokens, next_tokens = ( + self.semantic_chunker.overlapper.build_neighbor_context( + previous_content=base_contents[index - 1] if index > 0 else None, + current_content=base_contents[index], + next_content=base_contents[index + 1] if index + 1 < len(chunks) else None, ) - if previous_context: - previous_tokens = self.semantic_chunker.tokenizer.count_tokens(previous_context) - contextual_parts.append(previous_context) - - contextual_parts.append(base_contents[index]) - - if index + 1 < len(chunks): - next_context = self.semantic_chunker._take_first_tokens( - base_contents[index + 1], - overlap_budget, - ) - if next_context: - next_tokens = self.semantic_chunker.tokenizer.count_tokens(next_context) - contextual_parts.append(next_context) - - chunk.content = "\n\n".join(contextual_parts).strip() + ) if previous_tokens > 0: chunk.metadata["context_prev_tokens_applied"] = previous_tokens if next_tokens > 0: diff --git a/src/core/splitter/semantic_chunker.py b/src/core/splitter/semantic_chunker.py index 5451c2e..88b4e08 100644 --- a/src/core/splitter/semantic_chunker.py +++ b/src/core/splitter/semantic_chunker.py @@ -9,6 +9,8 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, List, Sequence +from .overlap import ChunkOverlapConfig, ChunkOverlapper + if TYPE_CHECKING: from src.core.llm.interfaces import IEmbedder from src.core.llm.tokenizer import Tokenizer @@ -60,8 +62,10 @@ def __init__( percentile: float = 95.0, min_chunk_tokens: int = 150, max_chunk_tokens: int = 512, - overlap_tokens: int = 50, + overlap_tokens: int = 64, overlap_percentage: float | None = None, + overlap_enabled: bool = True, + overlapper: ChunkOverlapper | None = None, min_distance_gate: float = 0.25, semantic_unit: str = "sentence", ): @@ -74,8 +78,10 @@ def __init__( percentile: 动态阈值使用的距离分位数,默认取 95。 min_chunk_tokens: 允许执行语义断点前的最小 Chunk token 数。 max_chunk_tokens: 单个 Chunk 的最大 token 数上限。 - overlap_tokens: 相邻 Chunk 的固定 token overlap 上限。 - overlap_percentage: 可选的 overlap 百分比配置;当 `overlap_tokens` 为 0 时启用。 + overlap_tokens: 相邻 Chunk 的固定 token overlap 上限,允许范围为 0 到 64。 + overlap_percentage: 兼容旧调用的百分比配置;仅在 `overlap_tokens` 为 0 时换算。 + overlap_enabled: 是否启用 overlap。 + overlapper: 可选的独立 overlap 处理器;传入后优先使用该实例。 min_distance_gate: 绝对最小语义距离阈值,用于避免过度切分。 semantic_unit: 语义相似度计算粒度,支持 `sentence` 或 `paragraph`。 @@ -91,8 +97,6 @@ def __init__( raise ValueError("max_chunk_tokens must be positive.") if min_chunk_tokens > max_chunk_tokens: raise ValueError("min_chunk_tokens cannot exceed max_chunk_tokens.") - if overlap_tokens < 0: - raise ValueError("overlap_tokens cannot be negative.") if overlap_percentage is not None and not 0 <= overlap_percentage < 1: raise ValueError("overlap_percentage must be in [0, 1).") if min_distance_gate < 0: @@ -106,28 +110,22 @@ def __init__( self.percentile = percentile self.min_chunk_tokens = min_chunk_tokens self.max_chunk_tokens = max_chunk_tokens - self.overlap_tokens = overlap_tokens - self.overlap_percentage = overlap_percentage + if overlapper is None: + resolved_overlap_tokens = overlap_tokens + if resolved_overlap_tokens == 0 and overlap_percentage is not None: + resolved_overlap_tokens = int(max_chunk_tokens * overlap_percentage) + overlapper = ChunkOverlapper( + tokenizer=tokenizer, + config=ChunkOverlapConfig( + enabled=overlap_enabled, + tokens=resolved_overlap_tokens, + ), + ) + self.overlapper = overlapper self.min_distance_gate = min_distance_gate self.semantic_unit = semantic_unit self.last_stats = SemanticChunkingStats() - def _resolve_overlap_tokens(self) -> int: - """ - 统一解析 overlap 配置,优先使用显式 token 数,其次回退到百分比配置。 - - Args: - None. - - Returns: - int: 当前配置下应使用的 overlap token 数。 - """ - if self.overlap_tokens > 0: - return self.overlap_tokens - if self.overlap_percentage is None: - return 0 - return max(0, int(self.max_chunk_tokens * self.overlap_percentage)) - def _count_tokens(self, text: str) -> int: """ 统计文本的 token 数,并忽略首尾空白字符。 @@ -140,55 +138,6 @@ def _count_tokens(self, text: str) -> int: """ return self.tokenizer.count_tokens(text.strip()) if text else 0 - def _take_first_tokens(self, text: str, token_limit: int) -> str: - """ - 取出文本开头的指定数量 token,并对齐 tokenizer 的截断语义。 - - Args: - text: 需要截取的原始文本。 - token_limit: 允许保留的最大 token 数。 - - Returns: - str: 截取后的头部文本。 - """ - if not text or token_limit <= 0: - return "" - truncated, _ = self.tokenizer.truncate_text(text, token_limit) - return truncated.strip() - - def _take_last_tokens(self, text: str, token_limit: int) -> str: - """ - 取出文本末尾的指定数量 token,用于拼接相邻 Chunk 的 overlap 上下文。 - - Args: - text: 需要截取的原始文本。 - token_limit: 允许保留的最大 token 数。 - - Returns: - str: 截取后的尾部文本。 - """ - cleaned = text.strip() - if not cleaned or token_limit <= 0: - return "" - if self._count_tokens(cleaned) <= token_limit: - return cleaned - - left = 0 - right = len(cleaned) - 1 - best_start = right - - while left <= right: - mid = (left + right) // 2 - candidate = cleaned[mid:].lstrip() - tokens = self._count_tokens(candidate) - if tokens <= token_limit: - best_start = mid - right = mid - 1 - else: - left = mid + 1 - - return cleaned[best_start:].lstrip() - def _split_oversized_text(self, text: str) -> List[str]: """ 对单个仍然过长的原子单元执行保底拆分,避免直接截断造成内容丢失。 @@ -206,7 +155,7 @@ def _split_oversized_text(self, text: str) -> List[str]: pieces: List[str] = [] remaining = cleaned while remaining: - head = self._take_first_tokens(remaining, self.max_chunk_tokens) + head = self.overlapper.take_first_tokens(remaining, self.max_chunk_tokens) if not head: break pieces.append(head) @@ -385,23 +334,11 @@ def _build_next_chunk(self, previous_chunk: str, next_atom: str) -> str: Returns: str: 带有 overlap 前缀的下一块文本。 """ - overlap_budget = self._resolve_overlap_tokens() - if overlap_budget <= 0: - return next_atom - - next_tokens = self._count_tokens(next_atom) - available_for_overlap = max(0, self.max_chunk_tokens - next_tokens) - if available_for_overlap <= 0: - return next_atom - - overlap_tail = self._take_last_tokens( + return self.overlapper.build_next_chunk( previous_chunk, - min(overlap_budget, available_for_overlap), + next_atom, + max_chunk_tokens=self.max_chunk_tokens, ) - if not overlap_tail: - return next_atom - - return f"{overlap_tail}\n\n{next_atom}".strip() def _group_atom_indices( self, diff --git a/tests/unit/core/splitter/test_factory.py b/tests/unit/core/splitter/test_factory.py index 04b2762..f0fc723 100644 --- a/tests/unit/core/splitter/test_factory.py +++ b/tests/unit/core/splitter/test_factory.py @@ -13,9 +13,13 @@ def has_capability(self, capability): def test_create_chunking_engine_should_pass_semantic_unit_from_settings(monkeypatch): monkeypatch.setattr(factory.settings, "CHUNKING_ENABLE_ADVANCED_PIPELINE", True) monkeypatch.setattr(factory.settings, "CHUNKING_SEMANTIC_UNIT", "paragraph") + monkeypatch.setattr(factory.settings, "CHUNKING_OVERLAP_ENABLED", False) + monkeypatch.setattr(factory.settings, "CHUNKING_OVERLAP_TOKENS", 7) monkeypatch.setattr(factory, "create_system_embedding_client", lambda: _FakeEmbedder()) engine = factory.create_chunking_engine() assert isinstance(engine.chunker, StructuredSemanticChunker) assert engine.chunker.semantic_chunker.semantic_unit == "paragraph" + assert engine.chunker.semantic_chunker.overlapper.effective_tokens == 0 + assert engine.chunker.semantic_chunker.overlapper.config.tokens == 7 diff --git a/tests/unit/core/splitter/test_overlap.py b/tests/unit/core/splitter/test_overlap.py new file mode 100644 index 0000000..092a977 --- /dev/null +++ b/tests/unit/core/splitter/test_overlap.py @@ -0,0 +1,32 @@ +from src.core.splitter import ChunkOverlapConfig, ChunkOverlapper + + +class MockWordTokenizer: + def count_tokens(self, text: str) -> int: + return len([part for part in text.split() if part]) + + def truncate_text(self, text: str, max_tokens: int): + words = [part for part in text.split() if part] + if len(words) <= max_tokens: + return " ".join(words), 0 + return " ".join(words[:max_tokens]), len(words) - max_tokens + + +def test_overlapper_should_apply_64_token_upper_bound(): + overlapper = ChunkOverlapper( + tokenizer=MockWordTokenizer(), + config=ChunkOverlapConfig(enabled=True, tokens=64), + ) + previous_chunk = " ".join(f"p{i}" for i in range(70)) + next_atom = " ".join(f"n{i}" for i in range(10)) + + result = overlapper.build_next_chunk( + previous_chunk, + next_atom, + max_chunk_tokens=100, + ) + + overlap_text, next_text = result.split("\n\n") + assert overlap_text == " ".join(f"p{i}" for i in range(6, 70)) + assert next_text == next_atom + assert overlapper.count_tokens(overlap_text) == 64 diff --git a/tests/unit/core/splitter/test_pipeline_chunker.py b/tests/unit/core/splitter/test_pipeline_chunker.py index 60a4ee0..a655b0b 100644 --- a/tests/unit/core/splitter/test_pipeline_chunker.py +++ b/tests/unit/core/splitter/test_pipeline_chunker.py @@ -151,3 +151,56 @@ async def test_aprocess_should_run_rule_then_semantic_pipeline(): for chunk in chunks: assert chunk.metadata["source_file"] == "override.md" + + +async def test_aprocess_should_not_apply_neighbor_context_when_overlap_disabled(): + elements = [ + MarkdownElement( + type=ElementType.HEADING, + content="# Intro", + start_line=0, + end_line=0, + metadata={"heading_level": 1, "heading_text": "Intro"}, + ), + MarkdownElement( + type=ElementType.PARAGRAPH, + content="before table", + start_line=2, + end_line=2, + ), + MarkdownElement( + type=ElementType.TABLE, + content="| a | b |\n|---|---|\n| 1 | 2 |", + start_line=4, + end_line=6, + ), + MarkdownElement( + type=ElementType.PARAGRAPH, + content="after table", + start_line=8, + end_line=8, + ), + ] + parse_result = ParseResult( + elements=elements, + tables=[], + images=[], + source_file="mock-doc.md", + ) + + semantic_chunker = PercentileSemanticChunker( + embedder=StaticEmbedder([]), + tokenizer=MockWordTokenizer(), + min_chunk_tokens=1, + max_chunk_tokens=20, + overlap_enabled=False, + overlap_tokens=2, + ) + chunker = StructuredSemanticChunker(semantic_chunker=semantic_chunker) + engine = ChunkingEngine(chunker=chunker, parser=FakeParser(parse_result)) + + chunks = await engine.aprocess("ignored") + + assert len(chunks) == 3 + assert chunks[1].content == "| a | b |\n|---|---|\n| 1 | 2 |" + assert "context_overlap_mode" not in chunks[1].metadata diff --git a/tests/unit/core/splitter/test_semantic_chunker.py b/tests/unit/core/splitter/test_semantic_chunker.py index 2f801c6..d3cb423 100644 --- a/tests/unit/core/splitter/test_semantic_chunker.py +++ b/tests/unit/core/splitter/test_semantic_chunker.py @@ -149,6 +149,67 @@ async def test_split_should_force_max_token_break_and_preserve_overlap(): ] +async def test_split_should_disable_overlap_when_configured(): + tokenizer = MockWordTokenizer() + embedder = StaticEmbedder( + [ + [1.0, 0.0], + [1.0, 0.0], + [1.0, 0.0], + ] + ) + chunker = PercentileSemanticChunker( + embedder=embedder, + tokenizer=tokenizer, + percentile=95, + min_chunk_tokens=1, + max_chunk_tokens=5, + overlap_enabled=False, + overlap_tokens=2, + min_distance_gate=0.9, + ) + + text = "\n\n".join( + [ + "a1 a2 a3", + "b1 b2 b3", + "c1 c2 c3", + ] + ) + + chunks = await chunker.split(text) + + assert chunks == [ + "a1 a2 a3", + "b1 b2 b3", + "c1 c2 c3", + ] + + +async def test_split_should_allow_overlap_token_upper_bound(): + tokenizer = MockWordTokenizer() + embedder = StaticEmbedder( + [ + [1.0, 0.0], + [1.0, 0.0], + ] + ) + chunker = PercentileSemanticChunker( + embedder=embedder, + tokenizer=tokenizer, + percentile=95, + min_chunk_tokens=1, + max_chunk_tokens=80, + overlap_tokens=64, + min_distance_gate=0.9, + ) + + chunks = await chunker.split(" ".join(f"a{i}" for i in range(40)) + "\n\nb1 b2") + + assert chunks == [" ".join(f"a{i}" for i in range(40)) + "\n\nb1 b2"] + assert chunker.overlapper.effective_tokens == 64 + + async def test_split_should_fallback_to_length_only_when_embedding_fails(): tokenizer = MockWordTokenizer() chunker = PercentileSemanticChunker( @@ -251,3 +312,16 @@ def test_splitter_should_reject_invalid_semantic_unit(): assert "semantic_unit must be one of" in str(exc) else: raise AssertionError("expected ValueError") + + +def test_splitter_should_reject_overlap_tokens_outside_supported_range(): + try: + PercentileSemanticChunker( + embedder=FailingEmbedder(), + tokenizer=MockWordTokenizer(), + overlap_tokens=65, + ) + except ValueError as exc: + assert "overlap tokens must be between 0 and 64" in str(exc) + else: + raise AssertionError("expected ValueError") diff --git a/tests/unit/test_config_sparse_vector.py b/tests/unit/test_config_sparse_vector.py index 7cd9d5c..448e15b 100644 --- a/tests/unit/test_config_sparse_vector.py +++ b/tests/unit/test_config_sparse_vector.py @@ -22,3 +22,26 @@ def test_should_reject_invalid_chunking_semantic_unit(): assert "CHUNKING_SEMANTIC_UNIT must be 'sentence' or 'paragraph'" in str(exc) else: raise AssertionError("expected ValueError") + + +def test_should_allow_chunking_overlap_token_bounds(): + disabled = Settings( + _env_file=None, + CHUNKING_OVERLAP_ENABLED=False, + CHUNKING_OVERLAP_TOKENS=0, + ) + upper_bound = Settings(_env_file=None, CHUNKING_OVERLAP_TOKENS=64) + + assert disabled.CHUNKING_OVERLAP_ENABLED is False + assert disabled.CHUNKING_OVERLAP_TOKENS == 0 + assert upper_bound.CHUNKING_OVERLAP_TOKENS == 64 + + +def test_should_reject_invalid_chunking_overlap_tokens(): + for value in (-1, 65): + try: + Settings(_env_file=None, CHUNKING_OVERLAP_TOKENS=value) + except ValueError as exc: + assert "CHUNKING_OVERLAP_TOKENS must be between 0 and 64" in str(exc) + else: + raise AssertionError("expected ValueError")