Skip to content

Commit 8e916e9

Browse files
authored
key value date support (#3653)
1 parent 140fefc commit 8e916e9

12 files changed

Lines changed: 397 additions & 15 deletions

File tree

nidx/nidx_json/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ mod resource_indexer;
2323
mod schema;
2424
pub mod search;
2525

26+
pub use tantivy::DateTime;
27+
2628
use std::collections::HashSet;
2729
use std::path::Path;
2830

nidx/nidx_json/src/resource_indexer.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,11 @@ pub fn index_json_fields(
4141
let resource_uuid = Uuid::parse_str(resource_id)?;
4242
let encoded = encode_rid(resource_uuid);
4343

44-
let json_field_type = schema.schema.get_field_entry(schema.json).field_type().clone();
45-
4644
// Build a single nested object per resource: { "field_id": <json> }
4745
let mut nested: Vec<(String, OwnedValue)> = Vec::with_capacity(resource.json_fields.len());
4846
for (field_key, json_info) in resource.json_fields.iter() {
4947
let parsed: serde_json::Value = serde_json::from_str(&json_info.value)?;
50-
let owned = json_field_type.value_from_json(parsed)?;
48+
let owned = OwnedValue::from(parsed);
5149
nested.push((field_key.clone(), owned));
5250
}
5351

nidx/nidx_json/src/search.rs

Lines changed: 184 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,18 @@ pub struct JsonPathFilter {
3535
pub enum JsonPredicate {
3636
Text(String),
3737

38-
IntRange { lower: Option<i64>, upper: Option<i64> },
39-
FloatRange { lower: Option<f64>, upper: Option<f64> },
38+
IntRange {
39+
lower: Option<i64>,
40+
upper: Option<i64>,
41+
},
42+
FloatRange {
43+
lower: Option<f64>,
44+
upper: Option<f64>,
45+
},
46+
DateRange {
47+
lower: Option<tantivy::DateTime>,
48+
upper: Option<tantivy::DateTime>,
49+
},
4050
Boolean(bool),
4151
}
4252

@@ -63,9 +73,13 @@ fn build_leaf_query(filter: &JsonPathFilter, json_field: Field) -> Box<dyn Query
6373

6474
match &filter.predicate {
6575
JsonPredicate::Text(val) => {
66-
let mut term = Term::from_field_json_path(json_field, &path, false);
76+
// Use the fast field to do exact match
77+
let mut term = Term::from_field_json_path(json_field, &path, true);
6778
term.append_type_and_str(val);
68-
Box::new(TermQuery::new(term, IndexRecordOption::Basic))
79+
Box::new(FastFieldRangeQuery::new(
80+
Bound::Included(term.clone()),
81+
Bound::Included(term),
82+
))
6983
}
7084

7185
JsonPredicate::IntRange { lower, upper } => {
@@ -101,6 +115,20 @@ fn build_leaf_query(filter: &JsonPathFilter, json_field: Field) -> Box<dyn Query
101115
term.append_type_and_fast_value(*val);
102116
Box::new(TermQuery::new(term, IndexRecordOption::Basic))
103117
}
118+
119+
JsonPredicate::DateRange { lower, upper } => {
120+
let build_bound = |opt: &Option<tantivy::DateTime>| -> Bound<Term> {
121+
match opt {
122+
None => Bound::Unbounded,
123+
Some(v) => {
124+
let mut term = Term::from_field_json_path(json_field, &path, false);
125+
term.append_type_and_fast_value(*v);
126+
Bound::Included(term)
127+
}
128+
}
129+
};
130+
Box::new(FastFieldRangeQuery::new(build_bound(lower), build_bound(upper)))
131+
}
104132
}
105133
}
106134
pub(crate) fn build_tantivy_query(expr: &JsonFilterExpression, json_field: Field) -> Box<dyn Query> {
@@ -247,11 +275,18 @@ mod tests {
247275
#[test]
248276
fn test_exact_match() {
249277
let (svc, apple, _banana, _cherry) = build_test_index();
278+
// Full stored value matches.
250279
let results = search(
251280
&svc,
252-
path("t/product", "name", JsonPredicate::Text("apple".to_string())),
281+
path("t/product", "name", JsonPredicate::Text("red apple".to_string())),
253282
);
254283
assert!(results.contains(&apple));
284+
// Partial token no longer matches (fast-field exact match, not token lookup).
285+
let results = search(
286+
&svc,
287+
path("t/product", "name", JsonPredicate::Text("apple".to_string())),
288+
);
289+
assert!(!results.contains(&apple));
255290
}
256291

257292
#[test]
@@ -382,4 +417,148 @@ mod tests {
382417
assert!(results.contains(&banana));
383418
assert!(!results.contains(&cherry));
384419
}
420+
421+
fn build_date_index() -> (JsonReaderService, Uuid, Uuid, Uuid) {
422+
let schema = JsonSchema::new();
423+
let index = Index::create_in_ram(schema.schema.clone());
424+
let mut writer: IndexWriter = index.writer(15_000_000).expect("writer failed");
425+
426+
let old_id = Uuid::parse_str("00000000000000000000000000000011").unwrap();
427+
let mid_id = Uuid::parse_str("00000000000000000000000000000012").unwrap();
428+
let new_id = Uuid::parse_str("00000000000000000000000000000013").unwrap();
429+
430+
let dt = |secs: i64| OwnedValue::Date(tantivy::DateTime::from_timestamp_secs(secs));
431+
432+
// old: 2020-01-01 00:00:00 UTC (1577836800)
433+
add_doc(
434+
&mut writer,
435+
&schema,
436+
old_id,
437+
"t/event",
438+
vec![("ts".to_string(), dt(1577836800))],
439+
);
440+
// mid: 2022-06-15 00:00:00 UTC (1655251200)
441+
add_doc(
442+
&mut writer,
443+
&schema,
444+
mid_id,
445+
"t/event",
446+
vec![("ts".to_string(), dt(1655251200))],
447+
);
448+
// new: 2024-01-01 00:00:00 UTC (1704067200)
449+
add_doc(
450+
&mut writer,
451+
&schema,
452+
new_id,
453+
"t/event",
454+
vec![("ts".to_string(), dt(1704067200))],
455+
);
456+
457+
writer.commit().expect("commit failed");
458+
let reader = index
459+
.reader_builder()
460+
.reload_policy(tantivy::ReloadPolicy::Manual)
461+
.try_into()
462+
.expect("reader failed");
463+
(JsonReaderService { index, schema, reader }, old_id, mid_id, new_id)
464+
}
465+
466+
#[test]
fn test_exact_match_text_field() {
    // `JsonPredicate::Text` must compare against the complete stored string:
    // no tokenization and no case folding.
    let schema = JsonSchema::new();
    let index = Index::create_in_ram(schema.schema.clone());
    let mut writer: IndexWriter = index.writer(15_000_000).expect("writer failed");

    let id = Uuid::parse_str("00000000000000000000000000000099").unwrap();
    let fields = vec![("color".to_string(), OwnedValue::Str("Red Apple".to_string()))];
    add_doc(&mut writer, &schema, id, "k/product", fields);
    writer.commit().expect("commit failed");

    let reader = index
        .reader_builder()
        .reload_policy(tantivy::ReloadPolicy::Manual)
        .try_into()
        .expect("reader failed");
    let svc = JsonReaderService { index, schema, reader };

    // Runs a text predicate for `needle` against the `color` key of `k/product`.
    let lookup = |needle: &str| {
        search(
            &svc,
            path("k/product", "color", JsonPredicate::Text(needle.to_string())),
        )
    };

    // The exact full value matches.
    assert!(lookup("Red Apple").contains(&id), "exact full value should match");

    // A single token of the stored value does NOT match.
    assert!(!lookup("red").contains(&id), "partial/lowercased token should not match");

    // The full value in a different case does NOT match.
    assert!(!lookup("red apple").contains(&id), "wrong case should not match");
}
508+
509+
#[test]
fn test_date_range_bounded() {
    let (svc, _old, mid, _new) = build_date_index();

    // Closed window [2021-01-01 .. 2023-01-01].
    let window = JsonPredicate::DateRange {
        lower: Some(tantivy::DateTime::from_timestamp_secs(1609459200)), // 2021
        upper: Some(tantivy::DateTime::from_timestamp_secs(1672531200)), // 2023
    };
    let hits = search(&svc, path("t/event", "ts", window));

    assert_eq!(hits, HashSet::from([mid]));
}
526+
527+
#[test]
fn test_date_range_unbounded_upper() {
    // `_old` stays underscore-prefixed because it is intentionally unused: the
    // exact-set assertion below already proves the 2020 document is excluded.
    let (svc, _old, mid, new) = build_date_index();

    // Half-open window [2022-01-01 .. ): no upper bound.
    let from_2022 = JsonPredicate::DateRange {
        lower: Some(tantivy::DateTime::from_timestamp_secs(1640995200)), // 2022
        upper: None,
    };
    let hits = search(&svc, path("t/event", "ts", from_2022));

    // Exact set equality, matching the other date-range tests.
    assert_eq!(hits, HashSet::from([mid, new]));
}
546+
547+
#[test]
fn test_date_range_unbounded_lower() {
    let (svc, old, _mid, _new) = build_date_index();

    // Half-open window ( .. 2021-01-01]: no lower bound.
    let up_to_2021 = JsonPredicate::DateRange {
        lower: None,
        upper: Some(tantivy::DateTime::from_timestamp_secs(1609459200)), // 2021
    };
    let hits = search(&svc, path("t/event", "ts", up_to_2021));

    assert_eq!(hits, HashSet::from([old]));
}
385564
}

nidx/nidx_protos/nodereader.proto

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,11 +339,17 @@ message JsonFieldPathFilter {
339339
optional double upper = 2;
340340
}
341341

342+
// Date range predicate for a JSON path filter. Both bounds are optional.
// NOTE(review): the Rust query planner shown in this change converts each
// bound using only `Timestamp.seconds`, so sub-second `nanos` appear to be
// dropped; confirm whether second granularity is intended.
message DateRangePredicate {
343+
optional google.protobuf.Timestamp lower = 1;
344+
optional google.protobuf.Timestamp upper = 2;
345+
}
346+
342347
oneof predicate {
343348
string text = 3;
344349
IntegerRangePredicate int_range = 4;
345350
FloatRangePredicate float_range = 5;
346351
bool boolean = 6;
352+
DateRangePredicate date_range = 7;
347353
}
348354
}
349355

nidx/src/searcher/query_planner.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,14 @@ fn proto_to_json_filter(expr: &nidx_protos::JsonFilterExpression) -> anyhow::Res
249249
upper: r.upper,
250250
},
251251
Predicate::Boolean(b) => JsonPredicate::Boolean(*b),
252+
Predicate::DateRange(r) => {
253+
let ts_to_dt =
254+
|ts: &nidx_protos::prost_types::Timestamp| nidx_json::DateTime::from_timestamp_secs(ts.seconds);
255+
JsonPredicate::DateRange {
256+
lower: r.lower.as_ref().map(ts_to_dt),
257+
upper: r.upper.as_ref().map(ts_to_dt),
258+
}
259+
}
252260
};
253261
Ok(JsonFilterExpression::Path(JsonPathFilter {
254262
field_id: path_filter.field_id.clone(),

nucliadb/src/nucliadb/common/filter_expression.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
Keyword,
4040
Kind,
4141
KVBoolMatch,
42+
KVDateRange,
4243
KVExactMatch,
4344
KVFilterExpression,
4445
KVRange,
@@ -180,6 +181,16 @@ def parse_kv_filter_expression(
180181
)
181182
path.boolean = expr.value
182183
return nodereader_pb2.JsonFilterExpression(path=path)
184+
elif isinstance(expr, KVDateRange):
185+
path = nodereader_pb2.JsonFieldPathFilter(
186+
field_id=f"k/{expr.field_id}",
187+
json_path=expr.key,
188+
)
189+
if expr.gte is not None:
190+
path.date_range.lower.FromDatetime(expr.gte)
191+
if expr.lte is not None:
192+
path.date_range.upper.FromDatetime(expr.lte)
193+
return nodereader_pb2.JsonFilterExpression(path=path)
183194
else:
184195
assert_never(expr)
185196

@@ -206,7 +217,7 @@ def _parse_kv_filter_expression(
206217
result = nodereader_pb2.JsonFilterExpression()
207218
result.bool_not.CopyFrom(_parse_kv_filter_expression(expr.operand, all_schemas, kbid))
208219
return result
209-
elif isinstance(expr, (KVExactMatch, KVRange, KVBoolMatch)):
220+
elif isinstance(expr, (KVExactMatch, KVRange, KVBoolMatch, KVDateRange)):
210221
schema = all_schemas.schemas.get(expr.field_id)
211222
if schema is None:
212223
raise InvalidQueryError("key_value", f"Unknown key-value schema: '{expr.field_id}'")
@@ -235,6 +246,12 @@ def _parse_kv_filter_expression(
235246
f"Key '{expr.key}' in schema '{expr.field_id}' is of type '{schema_field.type}', "
236247
f"but 'bool_match' requires type 'boolean'",
237248
)
249+
elif isinstance(expr, KVDateRange) and schema_field.type != "date":
250+
raise InvalidQueryError(
251+
"key_value",
252+
f"Key '{expr.key}' in schema '{expr.field_id}' is of type '{schema_field.type}', "
253+
f"but 'date_range' requires type 'date'",
254+
)
238255
return parse_kv_filter_expression(expr)
239256
else:
240257
assert_never(expr)

nucliadb/src/nucliadb/ingest/fields/key_value.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#
2020
from __future__ import annotations
2121

22+
from datetime import datetime
23+
2224
from typing_extensions import assert_never
2325

2426
from nucliadb.ingest.fields.base import Field
@@ -57,16 +59,41 @@ def _validate_keys(data: dict, schema: KVSchema) -> None:
5759
# Validate that `value` is acceptable for key `key` of schema `schema_name`,
# whose declared type is `expected`. Returns None on success and raises
# ValueError when the value does not satisfy the declared type.
def check_kv_type(schema_name: str, key: str, value: object, expected: KVFieldType) -> None:
5860
ok = False
5961
if expected is KVFieldType.TEXT:
60-
ok = isinstance(value, str)
62+
if isinstance(value, str):
63+
try:
# NOTE(review): datetime.fromisoformat only accepts a trailing "Z" from
# Python 3.11 onwards. On older interpreters an RFC 3339 string ending in
# "Z" would take the ValueError branch below and be accepted as text;
# confirm the minimum supported Python version.
64+
dt = datetime.fromisoformat(value)
65+
# Tantivy's JSON indexer auto-parses strings as DateTime only when
66+
# they parse as RFC 3339, which requires both a time component and a
67+
# timezone offset (Z or ±HH:MM).
68+
# Strings that parse without a UTC offset are accepted as text; strings
# that parse with an offset are rejected.
ok = dt.tzinfo is None
69+
except ValueError:
70+
ok = True # not parseable as a date at all, safe
71+
else:
72+
ok = False
6173
elif expected is KVFieldType.INTEGER:
6274
# bool is a subclass of int, so it is excluded explicitly.
ok = isinstance(value, int) and not isinstance(value, bool)
6375
elif expected is KVFieldType.FLOAT:
6476
# Integers are accepted for float keys; bool is excluded explicitly.
ok = isinstance(value, (int, float)) and not isinstance(value, bool)
6577
elif expected is KVFieldType.BOOLEAN:
6678
ok = isinstance(value, bool)
79+
elif expected is KVFieldType.DATE:
80+
# Dates must be stored as ISO-8601 strings (e.g. "2024-01-15T00:00:00Z")
81+
if isinstance(value, str):
82+
try:
83+
# NOTE(review): unlike the TEXT branch, there is no tzinfo check here, so
# naive or date-only strings pass validation. The TEXT-branch comment says
# only RFC 3339 strings (time component plus offset) are auto-parsed as
# dates by the indexer; confirm such values are normalized before indexing,
# otherwise date range filters may never match them.
datetime.fromisoformat(value)
84+
ok = True
85+
except ValueError:
86+
ok = False
87+
else:
88+
ok = False
6789
else:
6890
assert_never(expected)
6991
if not ok:
92+
# For a text key, a str value is rejected only when it parsed as an
# offset-aware datetime, hence the dedicated "looks like a date" message.
if expected is KVFieldType.TEXT and isinstance(value, str):
93+
raise ValueError(
94+
f"Key {key!r} in schema {schema_name!r} expects type 'text', but the value looks like "
95+
f"a date. Use a 'date' field type for date values."
96+
)
7097
# Generic mismatch: report the declared type and the Python type received.
raise ValueError(
7198
f"Key {key!r} in schema {schema_name!r} expects type {expected.value!r}, got {type(value).__name__}"
7299
)

0 commit comments

Comments
 (0)