Skip to content

Commit 0f15cd0

Browse files
committed
feat: confidence-gated fuzzy chunk matching with PQ segment store
Add first-class fuzzy chunk matching to SemBlend with quality guarantees via three-tier confidence gating (fast_reuse/verified_reuse/recompute), position-aware bathtub curve, and PQ-compressed segment embeddings. Core changes: - FuzzyMatchConfig: fully configurable per-model/engine confidence scoring with overlap ratio, positional coherence, configurable decay functions (exponential/linear/step), and bag-cosine verification - ChunkConfidence: per-chunk metadata with tiered reuse decisions - PQSegmentStore: product-quantized segment embedding store (32x compression, ~137MB at 100K donors vs 4.4GB naive float32) - RecomputeConfig: configurable layer recomputation with force/skip lists, max recompute cap, and fuzzy deviation boost - Position-aware bathtub curve with per-model sensitivity factors - embed_with_segments() for KV-aligned chunk embeddings - Fuzzy matching now ON by default (SEMBLEND_FUZZY_CHUNKS=1) Tests: 99 passed (36 new + 63 existing, all backward compatible) Signed-off-by: Zach Bennett <zach@worldflowai.com>
1 parent d8c1bc6 commit 0f15cd0

15 files changed

Lines changed: 1771 additions & 20 deletions

benchmarks/suite/fuzzy_tables.py

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
"""Fuzzy chunk matching benchmark table configurations (Tables 19-27).

Maps each fuzzy matching paper table to its benchmark config.
These extend the existing PAPER_TABLES dict in paper_tables.py.

NOTE: Do NOT execute these benchmarks without explicit go-ahead.
Benchmark runs are managed separately.
"""
9+
from __future__ import annotations
10+
11+
from dataclasses import dataclass, field
12+
from enum import Enum
13+
14+
15+
class Priority(Enum):
    """Priority tier attached to a benchmark table configuration.

    Each member's value is the lowercase spelling of its name, so a tier can
    be looked up from its string form with ``Priority("p0")``.
    """

    # NOTE(review): presumably P0 is the most urgent tier — confirm with the
    # benchmark owners; nothing in this module orders the tiers.
    P0 = "p0"
    P1 = "p1"
    P2 = "p2"
19+
20+
21+
class Engine(Enum):
    """Engine identifier recorded on each benchmark table configuration.

    Only a single engine is declared here; the value is the string form that
    ``Engine("vllm+lmcache")`` resolves back to this member.
    """

    VLLM_LMCACHE = "vllm+lmcache"
23+
24+
25+
@dataclass(frozen=True)
class TableConfig:
    """Immutable description of one benchmark table.

    The first six fields are required; the rest default to empty values
    (or 100 for ``n_samples``).

    Attributes:
        table_number: Numeric identifier of the table.
        title: Human-readable table title.
        priority: ``Priority`` tier of the table.
        engine: ``Engine`` the table is configured for.
        models: Model identifier strings the table covers.
        script: Benchmark script path string (e.g. ``"e2e/..._bench.py"``).
        datasets: Dataset names; empty tuple when not specified.
        n_samples: Sample count; defaults to 100.
        context_lengths: Context lengths; empty tuple when not specified.
        extra_args: Additional string key/value arguments; a fresh empty
            dict per instance when not specified.
        description: Free-form description of what the table shows.
        notes: Free-form additional notes.
    """

    table_number: int
    title: str
    priority: Priority
    engine: Engine
    models: tuple[str, ...]
    script: str
    datasets: tuple[str, ...] = ()
    n_samples: int = 100
    context_lengths: tuple[int, ...] = ()
    # frozen=True only blocks attribute re-assignment; the dict itself stays
    # mutable, and it also makes instances unhashable.
    extra_args: dict[str, str] = field(default_factory=dict)
    description: str = ""
    notes: str = ""
39+
40+
41+
# Model identifier strings shared by the table configs below.
# NOTE(review): these look like Hugging Face repo IDs for AWQ-quantized
# checkpoints — confirm against the benchmark runner.
QWEN_AWQ = "Qwen/Qwen2.5-7B-Instruct-AWQ"
LLAMA_AWQ = "meta-llama/Llama-3.1-8B-Instruct-AWQ"
43+
44+
45+
# Registry of fuzzy chunk matching benchmark tables, keyed by table number.
# Every key equals the ``table_number`` of the TableConfig it maps to.
# Fields not given for an entry fall back to the TableConfig defaults.
FUZZY_TABLES: dict[int, TableConfig] = {
    19: TableConfig(
        table_number=19,
        title="Fuzzy Matching Recovery: Exact vs Fuzzy Alignment Reuse",
        priority=Priority.P0,
        engine=Engine.VLLM_LMCACHE,
        models=(QWEN_AWQ, LLAMA_AWQ),
        script="e2e/fuzzy_recovery_bench.py",
        datasets=("shifted_prefix_xsum", "shifted_prefix_cnn"),
        n_samples=200,
        context_lengths=(2048, 4096, 8192, 16384),
        extra_args={
            "SEMBLEND_FUZZY_CHUNKS": "1",
            "SEMBLEND_FUZZY_CHUNK_OVERLAP": "0.90",
        },
        description=(
            "Shows fuzzy matching recovers 90%+ reuse in shifted-prefix "
            "scenarios where exact matching gets 0%."
        ),
    ),
    20: TableConfig(
        table_number=20,
        title="Fuzzy Matching TTFT Speedup vs Exact-Only",
        priority=Priority.P0,
        engine=Engine.VLLM_LMCACHE,
        models=(QWEN_AWQ,),
        script="e2e/fuzzy_ttft_bench.py",
        datasets=("shifted_prefix_xsum", "minor_edit_cnn"),
        n_samples=100,
        context_lengths=(2048, 4096, 8192, 16384),
        extra_args={"compare_exact": "1"},
        description=(
            "Measures additional TTFT speedup from fuzzy matching "
            "over exact-only across shifted prefix and minor edit scenarios."
        ),
    ),
    21: TableConfig(
        table_number=21,
        title="PPL by Confidence Threshold (Sweep)",
        priority=Priority.P0,
        engine=Engine.VLLM_LMCACHE,
        models=(QWEN_AWQ, LLAMA_AWQ),
        script="e2e/fuzzy_confidence_ppl_bench.py",
        datasets=(
            "shifted_prefix_xsum",
            "shifted_prefix_cnn",
            "cross_instruction_rag",
        ),
        n_samples=160,
        context_lengths=(4096, 8192),
        extra_args={
            "confidence_sweep": "0.70,0.80,0.85,0.90,0.95",
        },
        description=(
            "Quality-coverage tradeoff: PPL ratio and hit rate "
            "at various confidence thresholds."
        ),
    ),
    22: TableConfig(
        table_number=22,
        title="Confidence Scoring Component Ablation",
        priority=Priority.P0,
        engine=Engine.VLLM_LMCACHE,
        models=(QWEN_AWQ,),
        script="e2e/confidence_component_bench.py",
        datasets=("shifted_prefix_xsum",),
        n_samples=200,
        extra_args={"component_ablation": "1"},
        description=(
            "Incremental contribution of each confidence component: "
            "overlap-only, +position-delta, +bag-cosine, +segment-similarity."
        ),
    ),
    23: TableConfig(
        table_number=23,
        title="CacheBlend Verification for Fuzzy Matches",
        priority=Priority.P1,
        engine=Engine.VLLM_LMCACHE,
        models=(QWEN_AWQ,),
        script="e2e/fuzzy_cacheblend_bench.py",
        datasets=("shifted_prefix_xsum", "minor_edit_cnn"),
        n_samples=100,
        extra_args={"cacheblend_sweep": "1"},
        description=(
            "CacheBlend layer verification impact on PPL/TTFT "
            "across match confidence tiers."
        ),
    ),
    24: TableConfig(
        table_number=24,
        title="Fuzzy Hit Rate by Scenario Type",
        priority=Priority.P1,
        engine=Engine.VLLM_LMCACHE,
        models=(QWEN_AWQ,),
        script="e2e/fuzzy_scenario_hitrate_bench.py",
        datasets=(
            "shifted_prefix_xsum", "minor_edit_cnn",
            "same_topic_multinews", "multiturn_wildchat",
            "cross_instruction_rag",
        ),
        n_samples=100,
        extra_args={"scenario_breakdown": "1"},
        description=(
            "Hit rate comparison across scenario types: shifted prefix, "
            "minor edit, same-topic different input, multi-turn, cross-instruction."
        ),
    ),
    25: TableConfig(
        table_number=25,
        title="PQ Segment Embedding Store Scalability",
        priority=Priority.P1,
        engine=Engine.VLLM_LMCACHE,
        models=(QWEN_AWQ,),
        script="e2e/segment_scalability_bench.py",
        datasets=("shifted_prefix_xsum",),
        n_samples=50,
        extra_args={"donor_scales": "100,1000,10000,100000"},
        description=(
            "PQ segment store overhead: lookup latency, memory footprint, "
            "and pipeline latency at 100 to 100K donors."
        ),
    ),
    26: TableConfig(
        table_number=26,
        title="Position Delta Decay Function Ablation",
        priority=Priority.P2,
        engine=Engine.VLLM_LMCACHE,
        models=(QWEN_AWQ,),
        script="e2e/position_decay_ablation_bench.py",
        datasets=("minor_edit_cnn",),
        n_samples=200,
        extra_args={"decay_sweep": "exponential,linear,step,none"},
        description=(
            "Comparison of decay functions: exponential, linear, step, "
            "and no decay for position delta confidence."
        ),
    ),
    27: TableConfig(
        table_number=27,
        title="Full Ablation Matrix (8 Configurations)",
        priority=Priority.P0,
        engine=Engine.VLLM_LMCACHE,
        models=(QWEN_AWQ, LLAMA_AWQ),
        script="e2e/fuzzy_ablation_matrix_bench.py",
        datasets=("shifted_prefix_xsum", "minor_edit_cnn"),
        n_samples=100,
        extra_args={"full_ablation": "1"},
        description=(
            "8-config ablation: baseline, +fuzzy, +confidence, +segment, "
            "+cacheblend, +conf+segment, +conf+cacheblend, full stack."
        ),
        notes=(
            "Configs: (1) exact-only, (2) +fuzzy no gating, "
            "(3) +confidence gating, (4) +segment verify, "
            "(5) +CacheBlend verify, (6) +conf+segment, "
            "(7) +conf+CacheBlend, (8) full (all features)."
        ),
    ),
}

0 commit comments

Comments
 (0)