Skip to content

Commit ad566d4

Browse files
committed
Add Semantic Density Chunker (SDC) for improved chunking
Introduces a new token-aware, AST-driven chunking algorithm (Semantic Density Chunker) in scripts/ingest/semantic_chunker.py. Integrates SDC into chunking.py and pipeline.py, allowing selection via the INDEX_SDC_CHUNKS environment variable. The new chunker uses token budgets, respects semantic boundaries, merges small units, and scores chunks by information density for more effective code chunking.
1 parent cfa629f commit ad566d4

3 files changed

Lines changed: 533 additions & 1 deletion

File tree

scripts/ingest/chunking.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,13 @@
2121
except ImportError:
2222
_AST_ANALYZER_AVAILABLE = False
2323

24+
# Import Semantic Density Chunker (SDC) - our improved token-aware chunking
25+
try:
26+
from scripts.ingest.semantic_chunker import chunk_semantic_density, SDCConfig
27+
_SDC_AVAILABLE = True
28+
except ImportError:
29+
_SDC_AVAILABLE = False
30+
2431

2532
# Cache tokenizers loaded from TOKENIZER_JSON (or default) to avoid repeatedly
2633
# re-reading tokenizer.json from disk during micro-chunking.
@@ -284,3 +291,47 @@ def char_to_line(c: int) -> int:
284291
break
285292
i = i + s if s > 0 else i + 1
286293
return chunks
294+
295+
296+
def chunk_semantic_v2(
    text: str,
    language: str,
    min_tokens: int = 200,
    target_tokens: int = 800,
    max_tokens: int = 1500,
) -> List[Dict]:
    """
    Semantic Density Chunking (SDC) - token-aware, AST-driven chunking.

    This is the improved chunking algorithm that:
    - Uses token budgets instead of line counts
    - Respects AST boundaries (functions, classes, methods)
    - Merges small adjacent units for optimal density
    - Scores chunks by information density

    Args:
        text: Source code content
        language: Programming language
        min_tokens: Minimum tokens per chunk (default: 200)
        target_tokens: Target tokens per chunk (default: 800)
        max_tokens: Maximum tokens per chunk (default: 1500)

    Returns:
        List of chunk dicts with text, start, end, symbol, kind, token_count,
        density_score (falls back to plain ``chunk_semantic`` output when the
        SDC module is unavailable).
    """
    if not _SDC_AVAILABLE:
        # SDC module failed to import; degrade gracefully to the existing
        # semantic chunker rather than erroring at call time.
        return chunk_semantic(text, language)

    # Environment-variable overrides for the token budgets. The `or` fallback
    # matches the env-parsing convention used elsewhere in this codebase
    # (e.g. `int(os.environ.get("INDEX_CHUNK_OVERLAP", "20") or 20)`) so that
    # a variable set to the empty string falls back to the default instead of
    # raising ValueError from int("").
    min_tokens = int(os.environ.get("SDC_MIN_TOKENS") or min_tokens)
    target_tokens = int(os.environ.get("SDC_TARGET_TOKENS") or target_tokens)
    max_tokens = int(os.environ.get("SDC_MAX_TOKENS") or max_tokens)

    config = SDCConfig(
        min_tokens=min_tokens,
        target_tokens=target_tokens,
        max_tokens=max_tokens,
    )

    return chunk_semantic_density(text, language, config)

scripts/ingest/pipeline.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
iter_files,
5757
_should_skip_explicit_file_by_excluder,
5858
)
59-
from scripts.ingest.chunking import chunk_lines, chunk_semantic, chunk_by_tokens
59+
from scripts.ingest.chunking import chunk_lines, chunk_semantic, chunk_by_tokens, chunk_semantic_v2
6060
from scripts.ingest.symbols import (
6161
_extract_symbols,
6262
_choose_symbol_for_chunk,
@@ -802,6 +802,8 @@ def _index_single_file_inner(
802802
CHUNK_OVERLAP = int(os.environ.get("INDEX_CHUNK_OVERLAP", "20") or 20)
803803
use_micro = os.environ.get("INDEX_MICRO_CHUNKS", "0").lower() in {"1", "true", "yes", "on"}
804804
use_semantic = os.environ.get("INDEX_SEMANTIC_CHUNKS", "1").lower() in {"1", "true", "yes", "on"}
805+
# SDC = Semantic Density Chunker (token-aware, AST-driven chunking)
806+
use_sdc = os.environ.get("INDEX_SDC_CHUNKS", "0").lower() in {"1", "true", "yes", "on"}
805807

806808
if use_micro:
807809
try:
@@ -824,6 +826,9 @@ def _index_single_file_inner(
824826
logger.debug(f"Suppressed exception: {e}")
825827
except Exception:
826828
chunks = chunk_by_tokens(text)
829+
elif use_sdc:
830+
# Use Semantic Density Chunker (improved token-aware chunking)
831+
chunks = chunk_semantic_v2(text, language)
827832
elif use_semantic:
828833
chunks = chunk_semantic(text, language, CHUNK_LINES, CHUNK_OVERLAP)
829834
else:

0 commit comments

Comments
 (0)