Skip to content

Commit db9e35e

Browse files
committed
Property test for Comparator/ValueRange consistency, and fixes.
1 parent 7f39d5e commit db9e35e

10 files changed

Lines changed: 789 additions & 268 deletions

File tree

columnar/src/column/mod.rs

Lines changed: 78 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
mod dictionary_encoded;
22
mod serialize;
33

4+
use std::cell::RefCell;
45
use std::fmt::{self, Debug};
56
use std::io::Write;
67
use std::ops::{Range, RangeInclusive};
@@ -19,6 +20,11 @@ use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal;
1920
use crate::column_values::{ColumnValues, monotonic_map_column};
2021
use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId};
2122

23+
thread_local! {
24+
static ROWS: RefCell<Vec<RowId>> = const { RefCell::new(Vec::new()) };
25+
static DOCS: RefCell<Vec<DocId>> = const { RefCell::new(Vec::new()) };
26+
}
27+
2228
#[derive(Clone)]
2329
pub struct Column<T = u64> {
2430
pub index: ColumnIndex,
@@ -156,16 +162,15 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static + Default> Column<T> {
156162
output: &mut Vec<crate::ComparableDoc<Option<T>, DocId>>,
157163
value_range: ValueRange<T>,
158164
) {
159-
// TODO: Move `COLLECT_BLOCK_BUFFER_LEN` to allow for use here, or use a different constant
160-
// in this context.
161-
const BLOCK_LEN: usize = 64; // Corresponds to COLLECT_BLOCK_BUFFER_LEN in tantivy's docset
162165
match (&self.index, value_range) {
163166
(ColumnIndex::Empty { .. }, value_range) => {
164167
let nulls_match = match &value_range {
165168
ValueRange::All => true,
166169
ValueRange::Inclusive(_) => false,
167170
ValueRange::GreaterThan(_, nulls_match) => *nulls_match,
171+
ValueRange::GreaterThanOrEqual(_, nulls_match) => *nulls_match,
168172
ValueRange::LessThan(_, nulls_match) => *nulls_match,
173+
ValueRange::LessThanOrEqual(_, nulls_match) => *nulls_match,
169174
};
170175
if nulls_match {
171176
for &doc in input_docs {
@@ -178,83 +183,77 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static + Default> Column<T> {
178183
}
179184
(ColumnIndex::Full, value_range) => {
180185
self.values
181-
.get_vals_in_value_range(input_docs, output, value_range);
186+
.get_vals_in_value_range(input_docs, input_docs, output, value_range);
182187
}
183188
(ColumnIndex::Optional(optional_index), value_range) => {
184-
let len = input_docs.len();
185-
// Ensure the input docids length does not exceed BLOCK_LEN for stack allocation
186-
// safety. If it does, we might need to handle this with multiple
187-
// chunks or fallback to heap. For now, an assert is used to confirm
188-
// expected usage within batch processing limits.
189-
assert!(
190-
len <= BLOCK_LEN,
191-
"Input docids length ({}) exceeds BLOCK_LEN ({})",
192-
len,
193-
BLOCK_LEN
194-
);
195-
196-
let mut input_docs_buffer = [0u32; BLOCK_LEN];
197-
input_docs_buffer[..len].copy_from_slice(input_docs);
198-
199-
let mut dense_row_ids_buffer = [0u32; BLOCK_LEN];
200-
let mut dense_values_buffer = [T::default(); BLOCK_LEN];
201-
let mut presence_mask: u64 = 0; // Bitmask to track which input_docs have a value
202-
let mut num_present = 0;
203-
204-
// Phase 1: Identify existing RowIds and build dense_row_ids_buffer
205-
for (i, &doc_id) in input_docs_buffer[..len].iter().enumerate() {
206-
if let Some(row_id) = optional_index.rank_if_exists(doc_id) {
207-
dense_row_ids_buffer[num_present] = row_id;
208-
presence_mask |= 1u64 << i; // Set bit for present docid
209-
num_present += 1;
210-
}
211-
}
212-
213-
// Phase 2: Batch fetch values for present docs
214-
if num_present > 0 {
215-
self.values.get_vals(
216-
&dense_row_ids_buffer[..num_present],
217-
&mut dense_values_buffer[..num_present],
218-
);
219-
}
220-
221-
// Determine if nulls match the value range
222189
let nulls_match = match &value_range {
223190
ValueRange::All => true,
224191
ValueRange::Inclusive(_) => false,
225192
ValueRange::GreaterThan(_, nulls_match) => *nulls_match,
193+
ValueRange::GreaterThanOrEqual(_, nulls_match) => *nulls_match,
226194
ValueRange::LessThan(_, nulls_match) => *nulls_match,
195+
ValueRange::LessThanOrEqual(_, nulls_match) => *nulls_match,
227196
};
228197

229-
// Phase 3: Filter and merge results, reconstructing docids and values
230-
let mut dense_values_cursor = 0;
231-
for i in 0..len {
232-
let original_doc_id = input_docs_buffer[i];
233-
if (presence_mask & (1u64 << i)) != 0 {
234-
// This doc_id was present in the optional index and has a value
235-
let val = dense_values_buffer[dense_values_cursor];
236-
dense_values_cursor += 1;
237-
238-
// Check if the value matches the value range
239-
let value_matches = match &value_range {
240-
ValueRange::All => true,
241-
ValueRange::Inclusive(r) => r.contains(&val),
242-
ValueRange::GreaterThan(t, _) => val > *t,
243-
ValueRange::LessThan(t, _) => val < *t,
244-
};
198+
let fallback_needed = ROWS.with(|rows_cell| {
199+
DOCS.with(|docs_cell| {
200+
let mut rows = rows_cell.borrow_mut();
201+
let mut docs = docs_cell.borrow_mut();
202+
rows.clear();
203+
docs.clear();
204+
205+
let mut has_nulls = false;
206+
207+
for &doc_id in input_docs {
208+
if let Some(row_id) = optional_index.rank_if_exists(doc_id) {
209+
rows.push(row_id);
210+
docs.push(doc_id);
211+
} else {
212+
has_nulls = true;
213+
if nulls_match {
214+
break;
215+
}
216+
}
217+
}
245218

246-
if value_matches {
219+
if !has_nulls || !nulls_match {
220+
self.values.get_vals_in_value_range(
221+
&rows,
222+
&docs,
223+
output,
224+
value_range.clone(),
225+
);
226+
return false;
227+
}
228+
true
229+
})
230+
});
231+
232+
if fallback_needed {
233+
for &doc_id in input_docs {
234+
if let Some(row_id) = optional_index.rank_if_exists(doc_id) {
235+
let val = self.values.get_val(row_id);
236+
let value_matches = match &value_range {
237+
ValueRange::All => true,
238+
ValueRange::Inclusive(r) => r.contains(&val),
239+
ValueRange::GreaterThan(t, _) => val > *t,
240+
ValueRange::GreaterThanOrEqual(t, _) => val >= *t,
241+
ValueRange::LessThan(t, _) => val < *t,
242+
ValueRange::LessThanOrEqual(t, _) => val <= *t,
243+
};
244+
245+
if value_matches {
246+
output.push(crate::ComparableDoc {
247+
doc: doc_id,
248+
sort_key: Some(val),
249+
});
250+
}
251+
} else if nulls_match {
247252
output.push(crate::ComparableDoc {
248-
doc: original_doc_id,
249-
sort_key: Some(val),
253+
doc: doc_id,
254+
sort_key: None,
250255
});
251256
}
252-
} else if nulls_match {
253-
// This doc_id was not present in the optional index (null) and nulls match
254-
output.push(crate::ComparableDoc {
255-
doc: original_doc_id,
256-
sort_key: None,
257-
});
258257
}
259258
}
260259
}
@@ -263,7 +262,9 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static + Default> Column<T> {
263262
ValueRange::All => true,
264263
ValueRange::Inclusive(_) => false,
265264
ValueRange::GreaterThan(_, nulls_match) => *nulls_match,
265+
ValueRange::GreaterThanOrEqual(_, nulls_match) => *nulls_match,
266266
ValueRange::LessThan(_, nulls_match) => *nulls_match,
267+
ValueRange::LessThanOrEqual(_, nulls_match) => *nulls_match,
267268
};
268269
for i in 0..input_docs.len() {
269270
let docid = input_docs[i];
@@ -275,7 +276,9 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static + Default> Column<T> {
275276
ValueRange::All => true,
276277
ValueRange::Inclusive(r) => r.contains(&val),
277278
ValueRange::GreaterThan(t, _) => val > *t,
279+
ValueRange::GreaterThanOrEqual(t, _) => val >= *t,
278280
ValueRange::LessThan(t, _) => val < *t,
281+
ValueRange::LessThanOrEqual(t, _) => val <= *t,
279282
};
280283
if matches {
281284
output.push(crate::ComparableDoc {
@@ -310,9 +313,15 @@ pub enum ValueRange<T> {
310313
/// A range that matches all values greater than the threshold.
311314
/// The boolean flag indicates if null values should be included.
312315
GreaterThan(T, bool),
316+
/// A range that matches all values greater than or equal to the threshold.
317+
/// The boolean flag indicates if null values should be included.
318+
GreaterThanOrEqual(T, bool),
313319
/// A range that matches all values less than the threshold.
314320
/// The boolean flag indicates if null values should be included.
315321
LessThan(T, bool),
322+
/// A range that matches all values less than or equal to the threshold.
323+
/// The boolean flag indicates if null values should be included.
324+
LessThanOrEqual(T, bool),
316325
}
317326

318327
impl<T: PartialOrd> ValueRange<T> {
@@ -321,7 +330,9 @@ impl<T: PartialOrd> ValueRange<T> {
321330
ValueRange::Inclusive(range) => *range.start() <= max && *range.end() >= min,
322331
ValueRange::All => true,
323332
ValueRange::GreaterThan(val, _) => max > *val,
333+
ValueRange::GreaterThanOrEqual(val, _) => max >= *val,
324334
ValueRange::LessThan(val, _) => min < *val,
335+
ValueRange::LessThanOrEqual(val, _) => min <= *val,
325336
}
326337
}
327338
}

0 commit comments

Comments
 (0)