Skip to content

Commit ad566d4

Browse files
committed
Add Semantic Density Chunker (SDC) for improved chunking
Introduces a new token-aware, AST-driven chunking algorithm (Semantic Density Chunker) in scripts/ingest/semantic_chunker.py. Integrates SDC into chunking.py and pipeline.py, allowing selection via the INDEX_SDC_CHUNKS environment variable. The new chunker uses token budgets, respects semantic boundaries, merges small units, and scores chunks by information density for more effective code chunking.
1 parent cfa629f commit ad566d4

3 files changed

Lines changed: 533 additions & 1 deletion

File tree

scripts/ingest/chunking.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,13 @@
2121
except ImportError:
2222
_AST_ANALYZER_AVAILABLE = False
2323

24+
# Import Semantic Density Chunker (SDC) - our improved token-aware chunking
25+
try:
26+
from scripts.ingest.semantic_chunker import chunk_semantic_density, SDCConfig
27+
_SDC_AVAILABLE = True
28+
except ImportError:
29+
_SDC_AVAILABLE = False
30+
2431

2532
# Cache tokenizers loaded from TOKENIZER_JSON (or default) to avoid repeatedly
2633
# re-reading tokenizer.json from disk during micro-chunking.
@@ -284,3 +291,47 @@ def char_to_line(c: int) -> int:
284291
break
285292
i = i + s if s > 0 else i + 1
286293
return chunks
294+
295+
296+
def chunk_semantic_v2(
    text: str,
    language: str,
    min_tokens: int = 200,
    target_tokens: int = 800,
    max_tokens: int = 1500,
) -> List[Dict]:
    """
    Semantic Density Chunking (SDC) - token-aware, AST-driven chunking.

    This is the improved chunking algorithm that:
    - Uses token budgets instead of line counts
    - Respects AST boundaries (functions, classes, methods)
    - Merges small adjacent units for optimal density
    - Scores chunks by information density

    Args:
        text: Source code content
        language: Programming language
        min_tokens: Minimum tokens per chunk (default: 200)
        target_tokens: Target tokens per chunk (default: 800)
        max_tokens: Maximum tokens per chunk (default: 1500)

    Returns:
        List of chunk dicts with text, start, end, symbol, kind, token_count,
        density_score (falls back to plain ``chunk_semantic`` output when the
        SDC module is unavailable).
    """
    if not _SDC_AVAILABLE:
        # SDC module failed to import; degrade gracefully to the existing
        # semantic chunker rather than erroring at call time.
        return chunk_semantic(text, language)

    # Environment-variable overrides for the token budgets. The `or` fallback
    # matches the env-parsing convention used elsewhere in this codebase
    # (e.g. `int(os.environ.get("INDEX_CHUNK_OVERLAP", "20") or 20)`) so that
    # a variable set to the empty string falls back to the default instead of
    # raising ValueError from int("").
    min_tokens = int(os.environ.get("SDC_MIN_TOKENS") or min_tokens)
    target_tokens = int(os.environ.get("SDC_TARGET_TOKENS") or target_tokens)
    max_tokens = int(os.environ.get("SDC_MAX_TOKENS") or max_tokens)

    config = SDCConfig(
        min_tokens=min_tokens,
        target_tokens=target_tokens,
        max_tokens=max_tokens,
    )

    return chunk_semantic_density(text, language, config)

scripts/ingest/pipeline.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
iter_files,
5757
_should_skip_explicit_file_by_excluder,
5858
)
59-
from scripts.ingest.chunking import chunk_lines, chunk_semantic, chunk_by_tokens
59+
from scripts.ingest.chunking import chunk_lines, chunk_semantic, chunk_by_tokens, chunk_semantic_v2
6060
from scripts.ingest.symbols import (
6161
_extract_symbols,
6262
_choose_symbol_for_chunk,
@@ -802,6 +802,8 @@ def _index_single_file_inner(
802802
CHUNK_OVERLAP = int(os.environ.get("INDEX_CHUNK_OVERLAP", "20") or 20)
803803
use_micro = os.environ.get("INDEX_MICRO_CHUNKS", "0").lower() in {"1", "true", "yes", "on"}
804804
use_semantic = os.environ.get("INDEX_SEMANTIC_CHUNKS", "1").lower() in {"1", "true", "yes", "on"}
805+
# SDC = Semantic Density Chunker (token-aware, AST-driven chunking)
806+
use_sdc = os.environ.get("INDEX_SDC_CHUNKS", "0").lower() in {"1", "true", "yes", "on"}
805807

806808
if use_micro:
807809
try:
@@ -824,6 +826,9 @@ def _index_single_file_inner(
824826
logger.debug(f"Suppressed exception: {e}")
825827
except Exception:
826828
chunks = chunk_by_tokens(text)
829+
elif use_sdc:
830+
# Use Semantic Density Chunker (improved token-aware chunking)
831+
chunks = chunk_semantic_v2(text, language)
827832
elif use_semantic:
828833
chunks = chunk_semantic(text, language, CHUNK_LINES, CHUNK_OVERLAP)
829834
else:

0 commit comments

Comments
 (0)