Skip to content

Commit fbdabef

Browse files
authored
refactor: split statistics sketch storage and add RocksDB diagnostics (#312)
* refactor: split statistics sketch storage and improve cache diagnostics * chore: bump version to 0.1.8 * chore: codefmt * chore: codefmt
1 parent 506a243 commit fbdabef

File tree

10 files changed

+738
-116
lines changed

10 files changed

+738
-116
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
[package]
44
name = "kite_sql"
5-
version = "0.1.7"
5+
version = "0.1.8"
66
edition = "2021"
77
authors = ["Kould <kould2333@gmail.com>", "Xwg <loloxwg@gmail.com>"]
88
description = "SQL as a Function for Rust"

src/execution/dml/analyze.rs

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -167,9 +167,9 @@ impl Analyze {
167167
{
168168
let (histogram, sketch) =
169169
builder.build(histogram_buckets.unwrap_or(DEFAULT_NUM_OF_BUCKETS))?;
170-
let meta = StatisticsMeta::new(histogram, sketch);
170+
let meta = StatisticsMeta::new(histogram);
171171

172-
unsafe { &mut (*transaction) }.save_statistics_meta(cache, table_name, meta)?;
172+
unsafe { &mut (*transaction) }.save_statistics_meta(cache, table_name, meta, sketch)?;
173173
values.push(DataValue::Utf8 {
174174
value: format!("{table_name}/{index_id}"),
175175
ty: Utf8Type::Variable(None),
@@ -196,14 +196,18 @@ mod test {
196196
use crate::db::{DataBaseBuilder, ResultIter};
197197
use crate::errors::DatabaseError;
198198
use crate::execution::dml::analyze::DEFAULT_NUM_OF_BUCKETS;
199+
use crate::expression::range_detacher::Range;
200+
use crate::optimizer::core::cm_sketch::COUNT_MIN_SKETCH_STORAGE_PAGE_LEN;
199201
use crate::storage::{InnerIter, Storage, Transaction};
202+
use crate::types::value::DataValue;
200203
use std::ops::Bound;
201204
use tempfile::TempDir;
202205

203206
#[test]
204207
fn test_analyze() -> Result<(), DatabaseError> {
205208
test_statistics_meta_roundtrip()?;
206209
test_meta_loader_uses_cache()?;
210+
test_meta_loader_negative_cache()?;
207211
test_clean_expired_index()?;
208212

209213
Ok(())
@@ -268,12 +272,16 @@ mod test {
268272
kite_sql.run("analyze table t1")?.done()?;
269273

270274
let table_name = "t1".to_string().into();
271-
let mut transaction = kite_sql.storage.transaction()?;
272-
assert!(transaction
273-
.meta_loader(kite_sql.state.meta_cache())
274-
.load(&table_name, 1)?
275-
.is_some());
275+
let transaction = kite_sql.storage.transaction()?;
276+
let loader = transaction.meta_loader(kite_sql.state.meta_cache());
277+
assert!(loader.load(&table_name, 1)?.is_some());
278+
assert_eq!(
279+
loader.collect_count(&table_name, 1, &Range::Eq(DataValue::Int32(7)))?,
280+
Some(1)
281+
);
282+
drop(transaction);
276283

284+
let mut transaction = kite_sql.storage.transaction()?;
277285
let (min, max) = unsafe { &*transaction.table_codec() }.statistics_index_bound("t1", 1);
278286
let mut iter = transaction.range(Bound::Included(min), Bound::Included(max))?;
279287
let mut keys: Vec<Vec<u8>> = Vec::new();
@@ -285,10 +293,37 @@ mod test {
285293
transaction.remove(&key)?;
286294
}
287295

288-
assert!(transaction
289-
.meta_loader(kite_sql.state.meta_cache())
290-
.load(&table_name, 1)?
291-
.is_some());
296+
let transaction = kite_sql.storage.transaction()?;
297+
let loader = transaction.meta_loader(kite_sql.state.meta_cache());
298+
assert!(loader.load(&table_name, 1)?.is_some());
299+
assert_eq!(
300+
loader.collect_count(&table_name, 1, &Range::Eq(DataValue::Int32(7)))?,
301+
Some(1)
302+
);
303+
304+
Ok(())
305+
}
306+
307+
fn test_meta_loader_negative_cache() -> Result<(), DatabaseError> {
308+
let temp_dir = TempDir::new().expect("unable to create temporary working directory");
309+
let kite_sql = DataBaseBuilder::path(temp_dir.path()).build()?;
310+
311+
kite_sql
312+
.run("create table t1 (a int primary key, b int)")?
313+
.done()?;
314+
kite_sql.run("create index b_index on t1 (b)")?.done()?;
315+
316+
let table_name = "t1".to_string().into();
317+
let transaction = kite_sql.storage.transaction()?;
318+
let loader = transaction.meta_loader(kite_sql.state.meta_cache());
319+
assert!(loader.load(&table_name, 1)?.is_none());
320+
321+
let entry = kite_sql
322+
.state
323+
.meta_cache()
324+
.get(&(table_name.clone(), 1))
325+
.expect("missing statistics cache entry");
326+
assert!(entry.is_none());
292327

293328
Ok(())
294329
}
@@ -330,7 +365,17 @@ mod test {
330365
while iter.try_next()?.is_some() {
331366
keys += 1;
332367
}
333-
assert_eq!(keys, 1 + DEFAULT_NUM_OF_BUCKETS.min(DEFAULT_NUM_OF_BUCKETS));
368+
let table_name = "t1".to_string().into();
369+
let loader = transaction.meta_loader(kite_sql.state.meta_cache());
370+
let statistics_meta = loader.load(&table_name, 0)?.unwrap();
371+
let statistics_sketch = transaction
372+
.statistics_sketch(table_name.as_ref(), 0)?
373+
.unwrap();
374+
let expected_keys = 1
375+
+ 1
376+
+ statistics_sketch.storage_page_count(COUNT_MIN_SKETCH_STORAGE_PAGE_LEN)
377+
+ statistics_meta.histogram().buckets_len();
378+
assert_eq!(keys, expected_keys);
334379

335380
Ok(())
336381
}

src/optimizer/core/cm_sketch.rs

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use crate::expression::range_detacher::Range;
1717
use crate::serdes::{ReferenceSerialization, ReferenceTables};
1818
use crate::storage::{TableCache, Transaction};
1919
use crate::types::value::DataValue;
20+
use kite_sql_serde_macros::ReferenceSerialization;
2021
use siphasher::sip::SipHasher13;
2122
use std::borrow::Borrow;
2223
use std::hash::{Hash, Hasher};
@@ -25,6 +26,51 @@ use std::marker::PhantomData;
2526
use std::{cmp, mem};
2627

2728
pub(crate) type FastHasher = SipHasher13;
29+
pub(crate) const COUNT_MIN_SKETCH_STORAGE_PAGE_LEN: usize = 16 * 1024;
30+
31+
#[derive(Debug, Clone, ReferenceSerialization)]
32+
pub struct CountMinSketchMeta {
33+
width: usize,
34+
k_num: usize,
35+
page_len: usize,
36+
hasher_0: FastHasher,
37+
hasher_1: FastHasher,
38+
}
39+
40+
impl CountMinSketchMeta {
41+
pub fn width(&self) -> usize {
42+
self.width
43+
}
44+
45+
pub fn k_num(&self) -> usize {
46+
self.k_num
47+
}
48+
49+
pub fn page_len(&self) -> usize {
50+
self.page_len
51+
}
52+
}
53+
54+
impl CountMinSketchPage {
55+
pub fn row_idx(&self) -> usize {
56+
self.row_idx
57+
}
58+
59+
pub fn page_idx(&self) -> usize {
60+
self.page_idx
61+
}
62+
63+
pub fn counters(&self) -> &[usize] {
64+
&self.counters
65+
}
66+
}
67+
68+
#[derive(Debug, Clone, ReferenceSerialization)]
69+
pub struct CountMinSketchPage {
70+
row_idx: usize,
71+
page_idx: usize,
72+
counters: Vec<usize>,
73+
}
2874

2975
// https://github.com/jedisct1/rust-count-min-sketch
3076
#[derive(Debug, Clone)]
@@ -37,6 +83,121 @@ pub struct CountMinSketch<K> {
3783
phantom_k: PhantomData<K>,
3884
}
3985

86+
impl<K> CountMinSketch<K> {
87+
pub fn storage_page_count(&self, page_len: usize) -> usize {
88+
self.counters
89+
.iter()
90+
.map(|row| row.len().div_ceil(page_len))
91+
.sum()
92+
}
93+
94+
pub fn into_storage_parts(
95+
self,
96+
page_len: usize,
97+
) -> (CountMinSketchMeta, impl Iterator<Item = CountMinSketchPage>) {
98+
let CountMinSketch {
99+
counters,
100+
hashers,
101+
mask,
102+
k_num,
103+
..
104+
} = self;
105+
let width = mask + 1;
106+
let meta = CountMinSketchMeta {
107+
width,
108+
k_num,
109+
page_len,
110+
hasher_0: hashers[0].clone(),
111+
hasher_1: hashers[1].clone(),
112+
};
113+
let pages = counters
114+
.into_iter()
115+
.enumerate()
116+
.flat_map(move |(row_idx, counters)| {
117+
let page_count = counters.len().div_ceil(page_len);
118+
(0..page_count).map(move |page_idx| {
119+
let start = page_idx * page_len;
120+
let end = ((page_idx + 1) * page_len).min(counters.len());
121+
122+
CountMinSketchPage {
123+
row_idx,
124+
page_idx,
125+
counters: counters[start..end].to_vec(),
126+
}
127+
})
128+
});
129+
130+
(meta, pages)
131+
}
132+
133+
pub fn from_storage_parts(
134+
meta: CountMinSketchMeta,
135+
pages: Vec<CountMinSketchPage>,
136+
) -> Result<Self, DatabaseError> {
137+
let width = meta.width;
138+
let k_num = meta.k_num;
139+
let page_len = meta.page_len;
140+
if width == 0 || k_num == 0 || page_len == 0 {
141+
return Err(DatabaseError::InvalidValue(
142+
"count-min sketch storage meta is invalid".to_string(),
143+
));
144+
}
145+
if !width.is_power_of_two() {
146+
return Err(DatabaseError::InvalidValue(
147+
"count-min sketch width must be a power of two".to_string(),
148+
));
149+
}
150+
151+
let mut counters = vec![Vec::with_capacity(width); k_num];
152+
let mut expected_page_idx = vec![0usize; k_num];
153+
154+
for CountMinSketchPage {
155+
row_idx,
156+
page_idx,
157+
counters: page_counters,
158+
} in pages
159+
{
160+
if row_idx >= k_num {
161+
return Err(DatabaseError::InvalidValue(format!(
162+
"count-min sketch row index out of bounds: {row_idx}"
163+
)));
164+
}
165+
if page_idx != expected_page_idx[row_idx] {
166+
return Err(DatabaseError::InvalidValue(format!(
167+
"count-min sketch page sequence is invalid: row={row_idx}, page={page_idx}, expected={}",
168+
expected_page_idx[row_idx]
169+
)));
170+
}
171+
if page_counters.len() > page_len {
172+
return Err(DatabaseError::InvalidValue(format!(
173+
"count-min sketch page is too large: row={row_idx}, page={page_idx}"
174+
)));
175+
}
176+
177+
counters[row_idx].extend(page_counters);
178+
expected_page_idx[row_idx] += 1;
179+
}
180+
181+
for (row_idx, row) in counters.iter().enumerate() {
182+
if row.len() != width {
183+
return Err(DatabaseError::InvalidValue(format!(
184+
"count-min sketch row width mismatch: row={row_idx}, expected={width}, actual={}",
185+
row.len()
186+
)));
187+
}
188+
}
189+
190+
Ok(CountMinSketch {
191+
counters,
192+
offsets: vec![0; k_num],
193+
hashers: [meta.hasher_0, meta.hasher_1],
194+
mask: width - 1,
195+
k_num,
196+
phantom_k: Default::default(),
197+
})
198+
}
199+
}
200+
40201
impl CountMinSketch<DataValue> {
41202
pub fn collect_count(&self, ranges: &[Range]) -> usize {
42203
let mut count = 0;
@@ -256,4 +417,25 @@ mod tests {
256417
300
257418
);
258419
}
420+
421+
#[test]
422+
fn test_storage_parts_roundtrip() {
423+
let mut cms = CountMinSketch::<DataValue>::new(128, 0.95, 10.0);
424+
for i in 0..256 {
425+
cms.increment(&DataValue::Int32(i % 17));
426+
}
427+
428+
let (meta, pages) = cms.clone().into_storage_parts(8);
429+
let rebuilt =
430+
CountMinSketch::<DataValue>::from_storage_parts(meta, pages.collect()).unwrap();
431+
432+
assert_eq!(
433+
cms.estimate(&DataValue::Int32(3)),
434+
rebuilt.estimate(&DataValue::Int32(3))
435+
);
436+
assert_eq!(
437+
cms.estimate(&DataValue::Int32(9)),
438+
rebuilt.estimate(&DataValue::Int32(9))
439+
);
440+
}
259441
}

0 commit comments

Comments
 (0)