Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions nidx/nidx_json/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ mod resource_indexer;
mod schema;
pub mod search;

pub use tantivy::DateTime;

use std::collections::HashSet;
use std::path::Path;

Expand Down
4 changes: 1 addition & 3 deletions nidx/nidx_json/src/resource_indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,11 @@ pub fn index_json_fields(
let resource_uuid = Uuid::parse_str(resource_id)?;
let encoded = encode_rid(resource_uuid);

let json_field_type = schema.schema.get_field_entry(schema.json).field_type().clone();

// Build a single nested object per resource: { "field_id": <json> }
let mut nested: Vec<(String, OwnedValue)> = Vec::with_capacity(resource.json_fields.len());
for (field_key, json_info) in resource.json_fields.iter() {
let parsed: serde_json::Value = serde_json::from_str(&json_info.value)?;
let owned = json_field_type.value_from_json(parsed)?;
let owned = OwnedValue::from(parsed);
nested.push((field_key.clone(), owned));
}

Expand Down
189 changes: 184 additions & 5 deletions nidx/nidx_json/src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,18 @@ pub struct JsonPathFilter {
pub enum JsonPredicate {
Text(String),

IntRange { lower: Option<i64>, upper: Option<i64> },
FloatRange { lower: Option<f64>, upper: Option<f64> },
IntRange {
lower: Option<i64>,
upper: Option<i64>,
},
FloatRange {
lower: Option<f64>,
upper: Option<f64>,
},
DateRange {
lower: Option<tantivy::DateTime>,
upper: Option<tantivy::DateTime>,
},
Boolean(bool),
}

Expand All @@ -63,9 +73,13 @@ fn build_leaf_query(filter: &JsonPathFilter, json_field: Field) -> Box<dyn Query

match &filter.predicate {
JsonPredicate::Text(val) => {
let mut term = Term::from_field_json_path(json_field, &path, false);
// Use the fast field to do exact match
let mut term = Term::from_field_json_path(json_field, &path, true);
term.append_type_and_str(val);
Box::new(TermQuery::new(term, IndexRecordOption::Basic))
Box::new(FastFieldRangeQuery::new(
Bound::Included(term.clone()),
Bound::Included(term),
))
}

JsonPredicate::IntRange { lower, upper } => {
Expand Down Expand Up @@ -101,6 +115,20 @@ fn build_leaf_query(filter: &JsonPathFilter, json_field: Field) -> Box<dyn Query
term.append_type_and_fast_value(*val);
Box::new(TermQuery::new(term, IndexRecordOption::Basic))
}

JsonPredicate::DateRange { lower, upper } => {
let build_bound = |opt: &Option<tantivy::DateTime>| -> Bound<Term> {
match opt {
None => Bound::Unbounded,
Some(v) => {
let mut term = Term::from_field_json_path(json_field, &path, false);
term.append_type_and_fast_value(*v);
Bound::Included(term)
}
}
};
Box::new(FastFieldRangeQuery::new(build_bound(lower), build_bound(upper)))
}
}
}
pub(crate) fn build_tantivy_query(expr: &JsonFilterExpression, json_field: Field) -> Box<dyn Query> {
Expand Down Expand Up @@ -247,11 +275,18 @@ mod tests {
#[test]
fn test_exact_match() {
    let (svc, apple, _banana, _cherry) = build_test_index();
    // Full stored value matches.
    let results = search(
        &svc,
        path("t/product", "name", JsonPredicate::Text("red apple".to_string())),
    );
    assert!(results.contains(&apple));
    // Partial token no longer matches (fast-field exact match, not token lookup).
    let results = search(
        &svc,
        path("t/product", "name", JsonPredicate::Text("apple".to_string())),
    );
    assert!(!results.contains(&apple));
}

#[test]
Expand Down Expand Up @@ -382,4 +417,148 @@ mod tests {
assert!(results.contains(&banana));
assert!(!results.contains(&cherry));
}

/// Builds an in-memory index with three "t/event" resources whose `ts` key
/// holds a date: 2020-01-01, 2022-06-15 and 2024-01-01 (all 00:00:00 UTC).
/// Returns the reader service plus the three resource ids (old, mid, new).
fn build_date_index() -> (JsonReaderService, Uuid, Uuid, Uuid) {
    let schema = JsonSchema::new();
    let index = Index::create_in_ram(schema.schema.clone());
    let mut writer: IndexWriter = index.writer(15_000_000).expect("writer failed");

    let old_id = Uuid::parse_str("00000000000000000000000000000011").unwrap();
    let mid_id = Uuid::parse_str("00000000000000000000000000000012").unwrap();
    let new_id = Uuid::parse_str("00000000000000000000000000000013").unwrap();

    // (resource id, unix seconds): 2020-01-01, 2022-06-15, 2024-01-01.
    let docs = [
        (old_id, 1577836800_i64),
        (mid_id, 1655251200_i64),
        (new_id, 1704067200_i64),
    ];
    for (id, secs) in docs {
        let ts = OwnedValue::Date(tantivy::DateTime::from_timestamp_secs(secs));
        add_doc(&mut writer, &schema, id, "t/event", vec![("ts".to_string(), ts)]);
    }

    writer.commit().expect("commit failed");
    let reader = index
        .reader_builder()
        .reload_policy(tantivy::ReloadPolicy::Manual)
        .try_into()
        .expect("reader failed");
    (JsonReaderService { index, schema, reader }, old_id, mid_id, new_id)
}

#[test]
fn test_exact_match_text_field() {
    // JsonPredicate::Text must behave as a true exact match against the
    // columnar (fast-field) value: case-sensitive and without tokenization.
    let schema = JsonSchema::new();
    let index = Index::create_in_ram(schema.schema.clone());
    let mut writer: IndexWriter = index.writer(15_000_000).expect("writer failed");

    let id = Uuid::parse_str("00000000000000000000000000000099").unwrap();
    add_doc(
        &mut writer,
        &schema,
        id,
        "k/product",
        vec![("color".to_string(), OwnedValue::Str("Red Apple".to_string()))],
    );
    writer.commit().expect("commit failed");
    let reader = index
        .reader_builder()
        .reload_policy(tantivy::ReloadPolicy::Manual)
        .try_into()
        .expect("reader failed");
    let svc = JsonReaderService { index, schema, reader };

    // Small helper so each case reads as a single line.
    let query =
        |text: &str| search(&svc, path("k/product", "color", JsonPredicate::Text(text.to_string())));

    assert!(query("Red Apple").contains(&id), "exact full value should match");
    assert!(!query("red").contains(&id), "partial/lowercased token should not match");
    assert!(!query("red apple").contains(&id), "wrong case should not match");
}

#[test]
fn test_date_range_bounded() {
    let (svc, _old, mid, _new) = build_date_index();
    // A window of [2021-01-01 .. 2023-01-01] should select only the 2022 event.
    let lower = tantivy::DateTime::from_timestamp_secs(1609459200); // 2021-01-01
    let upper = tantivy::DateTime::from_timestamp_secs(1672531200); // 2023-01-01
    let predicate = JsonPredicate::DateRange {
        lower: Some(lower),
        upper: Some(upper),
    };
    let results = search(&svc, path("t/event", "ts", predicate));
    assert_eq!(results, HashSet::from([mid]));
}

#[test]
fn test_date_range_unbounded_upper() {
    // `old` is asserted against below, so it must not carry the leading
    // underscore that marks an unused binding (clippy: used_underscore_binding).
    let (svc, old, mid, new) = build_date_index();
    // [2022-01-01 .. ]: everything at or after the lower bound matches.
    let results = search(
        &svc,
        path(
            "t/event",
            "ts",
            JsonPredicate::DateRange {
                lower: Some(tantivy::DateTime::from_timestamp_secs(1640995200)), // 2022
                upper: None,
            },
        ),
    );
    assert!(results.contains(&mid));
    assert!(results.contains(&new));
    // The 2020 event falls before the lower bound and must be excluded.
    assert!(!results.contains(&old));
}

#[test]
fn test_date_range_unbounded_lower() {
    let (svc, old, _mid, _new) = build_date_index();
    // Upper-bounded only: [ .. 2021-01-01] keeps just the 2020 event.
    let predicate = JsonPredicate::DateRange {
        lower: None,
        upper: Some(tantivy::DateTime::from_timestamp_secs(1609459200)), // 2021-01-01
    };
    let results = search(&svc, path("t/event", "ts", predicate));
    assert_eq!(results, HashSet::from([old]));
}
}
6 changes: 6 additions & 0 deletions nidx/nidx_protos/nodereader.proto
Original file line number Diff line number Diff line change
Expand Up @@ -339,11 +339,17 @@ message JsonFieldPathFilter {
optional double upper = 2;
}

// Date range over a JSON date value; omit a side to leave that bound open.
// The nidx search backend treats both bounds as inclusive.
// NOTE(review): the query planner currently maps only whole seconds of the
// Timestamp (nanos are dropped) — confirm sub-second precision is not needed.
message DateRangePredicate {
optional google.protobuf.Timestamp lower = 1;
optional google.protobuf.Timestamp upper = 2;
}

oneof predicate {
string text = 3;
IntegerRangePredicate int_range = 4;
FloatRangePredicate float_range = 5;
bool boolean = 6;
DateRangePredicate date_range = 7;
}
}

Expand Down
8 changes: 8 additions & 0 deletions nidx/src/searcher/query_planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,14 @@ fn proto_to_json_filter(expr: &nidx_protos::JsonFilterExpression) -> anyhow::Res
upper: r.upper,
},
Predicate::Boolean(b) => JsonPredicate::Boolean(*b),
Predicate::DateRange(r) => {
let ts_to_dt =
|ts: &nidx_protos::prost_types::Timestamp| nidx_json::DateTime::from_timestamp_secs(ts.seconds);
JsonPredicate::DateRange {
lower: r.lower.as_ref().map(ts_to_dt),
upper: r.upper.as_ref().map(ts_to_dt),
}
}
};
Ok(JsonFilterExpression::Path(JsonPathFilter {
field_id: path_filter.field_id.clone(),
Expand Down
19 changes: 18 additions & 1 deletion nucliadb/src/nucliadb/common/filter_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
Keyword,
Kind,
KVBoolMatch,
KVDateRange,
KVExactMatch,
KVFilterExpression,
KVRange,
Expand Down Expand Up @@ -180,6 +181,16 @@ def parse_kv_filter_expression(
)
path.boolean = expr.value
return nodereader_pb2.JsonFilterExpression(path=path)
elif isinstance(expr, KVDateRange):
path = nodereader_pb2.JsonFieldPathFilter(
field_id=f"k/{expr.field_id}",
json_path=expr.key,
)
if expr.gte is not None:
path.date_range.lower.FromDatetime(expr.gte)
if expr.lte is not None:
path.date_range.upper.FromDatetime(expr.lte)
return nodereader_pb2.JsonFilterExpression(path=path)
else:
assert_never(expr)

Expand All @@ -206,7 +217,7 @@ def _parse_kv_filter_expression(
result = nodereader_pb2.JsonFilterExpression()
result.bool_not.CopyFrom(_parse_kv_filter_expression(expr.operand, all_schemas, kbid))
return result
elif isinstance(expr, (KVExactMatch, KVRange, KVBoolMatch)):
elif isinstance(expr, (KVExactMatch, KVRange, KVBoolMatch, KVDateRange)):
schema = all_schemas.schemas.get(expr.field_id)
if schema is None:
raise InvalidQueryError("key_value", f"Unknown key-value schema: '{expr.field_id}'")
Expand Down Expand Up @@ -235,6 +246,12 @@ def _parse_kv_filter_expression(
f"Key '{expr.key}' in schema '{expr.field_id}' is of type '{schema_field.type}', "
f"but 'bool_match' requires type 'boolean'",
)
elif isinstance(expr, KVDateRange) and schema_field.type != "date":
raise InvalidQueryError(
"key_value",
f"Key '{expr.key}' in schema '{expr.field_id}' is of type '{schema_field.type}', "
f"but 'date_range' requires type 'date'",
)
return parse_kv_filter_expression(expr)
else:
assert_never(expr)
Expand Down
29 changes: 28 additions & 1 deletion nucliadb/src/nucliadb/ingest/fields/key_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#
from __future__ import annotations

from datetime import datetime

from typing_extensions import assert_never

from nucliadb.ingest.fields.base import Field
Expand Down Expand Up @@ -57,16 +59,41 @@ def _validate_keys(data: dict, schema: KVSchema) -> None:
def _parse_iso_datetime(value: str) -> "datetime | None":
    """Parse an ISO-8601 string, tolerating a trailing 'Z' UTC suffix.

    `datetime.fromisoformat` only accepts the 'Z' suffix from Python 3.11
    onwards, but the documented storage format uses it (e.g.
    "2024-01-15T00:00:00Z"), so normalize it to an explicit offset first.
    Returns None when the string is not a valid ISO-8601 datetime.
    """
    if value.endswith(("Z", "z")):
        value = value[:-1] + "+00:00"
    try:
        return datetime.fromisoformat(value)
    except ValueError:
        return None


def check_kv_type(schema_name: str, key: str, value: object, expected: KVFieldType) -> None:
    """Validate that `value` matches the schema type `expected`.

    Raises ValueError with a user-facing message when the value does not
    conform; returns None when it does.
    """
    ok = False
    if expected is KVFieldType.TEXT:
        if isinstance(value, str):
            dt = _parse_iso_datetime(value)
            # Tantivy's JSON indexer auto-parses strings as DateTime only when
            # they parse as RFC 3339, which requires both a time component and a
            # timezone offset (Z or ±HH:MM). Reject such strings here so they
            # are not silently indexed as dates instead of text.
            ok = dt is None or dt.tzinfo is None
        else:
            ok = False
    elif expected is KVFieldType.INTEGER:
        # bool is a subclass of int; exclude it explicitly.
        ok = isinstance(value, int) and not isinstance(value, bool)
    elif expected is KVFieldType.FLOAT:
        ok = isinstance(value, (int, float)) and not isinstance(value, bool)
    elif expected is KVFieldType.BOOLEAN:
        ok = isinstance(value, bool)
    elif expected is KVFieldType.DATE:
        # Dates must be stored as ISO-8601 strings (e.g. "2024-01-15T00:00:00Z")
        ok = isinstance(value, str) and _parse_iso_datetime(value) is not None
    else:
        assert_never(expected)
    if not ok:
        if expected is KVFieldType.TEXT and isinstance(value, str):
            raise ValueError(
                f"Key {key!r} in schema {schema_name!r} expects type 'text', but the value looks like "
                f"a date. Use a 'date' field type for date values."
            )
        raise ValueError(
            f"Key {key!r} in schema {schema_name!r} expects type {expected.value!r}, got {type(value).__name__}"
        )
Loading
Loading