Skip to content

Commit f103b40

Browse files
feat(query): implement fuzzy_match filter operator
1 parent ff3e6da commit f103b40

5 files changed

Lines changed: 126 additions & 4 deletions

File tree

config/schemas/graph_query.schema.json

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -454,11 +454,11 @@
454454
"properties": {
455455
"op": {
456456
"type": "string",
457-
"description": "Filter operator. Comparison: eq, gt, lt, gte, lte. Set membership: in (array value). Substring matching: contains, starts_with, ends_with. Nullability: is_null, is_not_null (no value). Word matching: token_match (exact word match), all_tokens (all words present), any_tokens (any word present) — only on fields that support word matching.",
458-
"enum": ["eq", "gt", "lt", "gte", "lte", "in", "contains", "starts_with", "ends_with", "is_null", "is_not_null", "token_match", "all_tokens", "any_tokens"]
457+
"description": "Filter operator. Comparison: eq, gt, lt, gte, lte. Set membership: in (array value). Substring matching: contains, starts_with, ends_with. Nullability: is_null, is_not_null (no value). Word matching: token_match (exact word match), all_tokens (all words present), any_tokens (any word present) — only on fields that support word matching. Fuzzy matching: fuzzy_match (case-insensitive approximate match by n-gram distance) — only on string fields.",
458+
"enum": ["eq", "gt", "lt", "gte", "lte", "in", "contains", "starts_with", "ends_with", "is_null", "is_not_null", "token_match", "all_tokens", "any_tokens", "fuzzy_match"]
459459
},
460460
"value": {
461-
"description": "Filter value. Required for all operators except is_null/is_not_null. Type: array for 'in', string for contains/starts_with/ends_with/token_match/all_tokens/any_tokens, scalar for comparison operators."
461+
"description": "Filter value. Required for all operators except is_null/is_not_null. Type: array for 'in', string for contains/starts_with/ends_with/token_match/all_tokens/any_tokens/fuzzy_match, scalar for comparison operators."
462462
}
463463
},
464464
"additionalProperties": false,
@@ -488,6 +488,10 @@
488488
{
489489
"if": { "properties": { "op": { "enum": ["token_match", "all_tokens", "any_tokens"] } } },
490490
"then": { "required": ["value"], "properties": { "value": { "type": "string", "maxLength": 1024 } } }
491+
},
492+
{
493+
"if": { "properties": { "op": { "const": "fuzzy_match" } } },
494+
"then": { "required": ["value"], "properties": { "value": { "type": "string", "maxLength": 1024 } } }
491495
}
492496
]
493497
},

crates/query-engine/compiler/src/input.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,8 @@ pub enum FilterOp {
510510
AllTokens,
511511
/// Any token present via `hasAnyTokens()`. Requires a text index on the column.
512512
AnyTokens,
513+
/// Fuzzy string matching via ClickHouse `ngramDistanceCaseInsensitive`; a row matches when the distance is below 0.3.
514+
FuzzyMatch,
513515
}
514516

515517
fn deserialize_filters<'de, D>(

crates/query-engine/compiler/src/passes/codegen/clickhouse.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,4 +1041,41 @@ mod tests {
10411041

10421042
assert_eq!(pq.render(), "SELECT {p0:String} AND {p1:Int64}");
10431043
}
1044+
1045+
#[test]
1046+
fn fuzzy_match_operator() {
1047+
use crate::input::{FilterOp, InputFilter};
1048+
use crate::passes::shared::filter_to_expr;
1049+
1050+
let filter = InputFilter {
1051+
op: Some(FilterOp::FuzzyMatch),
1052+
value: Some(Value::from("antigravity")),
1053+
data_type: Some(ontology::DataType::String),
1054+
..Default::default()
1055+
};
1056+
let expr = filter_to_expr("n", "name", &filter);
1057+
let q = Query {
1058+
select: vec![SelectExpr {
1059+
expr: Expr::col("n", "id"),
1060+
alias: None,
1061+
}],
1062+
from: TableRef::scan("nodes", "n"),
1063+
where_clause: Some(expr),
1064+
..Default::default()
1065+
};
1066+
1067+
let result = codegen(&Node::Query(Box::new(q)), empty_ctx(), QueryConfig::empty()).unwrap();
1068+
assert_eq!(
1069+
result.sql,
1070+
"SELECT n.id FROM nodes AS n WHERE (ngramDistanceCaseInsensitive(n.name, {p0:String}) < {p1:Float64})"
1071+
);
1072+
assert_eq!(
1073+
result.params.get("p0").map(|p| &p.value),
1074+
Some(&Value::from("antigravity"))
1075+
);
1076+
assert_eq!(
1077+
result.params.get("p1").map(|p| &p.value),
1078+
Some(&Value::from(0.3))
1079+
);
1080+
}
10441081
}

crates/query-engine/compiler/src/passes/shared.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,14 @@ pub fn filter_to_expr(alias: &str, prop: &str, filter: &InputFilter) -> Expr {
6565
"hasAnyTokens",
6666
vec![col, Expr::param(ChType::String, str_val())],
6767
),
68+
Some(FilterOp::FuzzyMatch) => Expr::binary(
69+
Op::Lt,
70+
Expr::func(
71+
"ngramDistanceCaseInsensitive",
72+
vec![col, Expr::param(ChType::String, str_val())],
73+
),
74+
Expr::param(ChType::Float64, serde_json::Value::from(0.3)),
75+
),
6876
}
6977
}
7078

crates/query-engine/compiler/src/passes/validate.rs

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -534,14 +534,20 @@ impl<'a> Validator<'a> {
534534
) -> Result<()> {
535535
let op = filter.op.unwrap_or(FilterOp::Eq);
536536

537+
if op == FilterOp::FuzzyMatch && data_type != DataType::String {
538+
return Err(QueryError::Validation(format!(
539+
"filter on \"{prop}\" for {entity}: fuzzy_match requires a string field, got {data_type}"
540+
)));
541+
}
542+
537543
// Ops without a value — nothing to type-check.
538544
if matches!(op, FilterOp::IsNull | FilterOp::IsNotNull) {
539545
return Ok(());
540546
}
541547

542548
let is_like_op = matches!(
543549
op,
544-
FilterOp::Contains | FilterOp::StartsWith | FilterOp::EndsWith
550+
FilterOp::Contains | FilterOp::StartsWith | FilterOp::EndsWith | FilterOp::FuzzyMatch
545551
);
546552

547553
let is_token_op = matches!(
@@ -2507,6 +2513,71 @@ mod tests {
25072513
);
25082514
}
25092515

2516+
#[test]
2517+
fn rejects_fuzzy_match_on_disallowed_field() {
2518+
let ont = ontology_with_sensitive_field();
2519+
let validator = Validator::new(&ont);
2520+
let input = parse_input(
2521+
r#"{
2522+
"query_type": "traversal",
2523+
"node": {"id": "u", "entity": "User",
2524+
"filters": {"email": {"op": "fuzzy_match", "value": "example"}}},
2525+
"limit": 10
2526+
}"#,
2527+
)
2528+
.unwrap();
2529+
2530+
let err = validator.check_references(&input).unwrap_err();
2531+
assert!(
2532+
err.to_string().contains("LIKE operators"),
2533+
"expected like_allowed rejection, got: {err}"
2534+
);
2535+
}
2536+
2537+
#[test]
2538+
fn rejects_short_fuzzy_match_pattern() {
2539+
let ont = test_ontology();
2540+
let validator = Validator::new(&ont);
2541+
let input = parse_input(
2542+
r#"{
2543+
"query_type": "traversal",
2544+
"node": {"id": "u", "entity": "User",
2545+
"filters": {"username": {"op": "fuzzy_match", "value": "ab"}}},
2546+
"limit": 10
2547+
}"#,
2548+
)
2549+
.unwrap();
2550+
2551+
let err = validator.check_references(&input).unwrap_err();
2552+
assert!(
2553+
err.to_string()
2554+
.contains("search pattern must be at least 3 characters"),
2555+
"expected min length error, got: {err}"
2556+
);
2557+
}
2558+
2559+
#[test]
2560+
fn rejects_fuzzy_match_on_non_string_field() {
2561+
let ont = test_ontology();
2562+
let validator = Validator::new(&ont);
2563+
let input = parse_input(
2564+
r#"{
2565+
"query_type": "traversal",
2566+
"node": {"id": "u", "entity": "User",
2567+
"filters": {"created_at": {"op": "fuzzy_match", "value": "2024-01-01"}}},
2568+
"limit": 10
2569+
}"#,
2570+
)
2571+
.unwrap();
2572+
2573+
let err = validator.check_references(&input).unwrap_err();
2574+
assert!(
2575+
err.to_string()
2576+
.contains("fuzzy_match requires a string field"),
2577+
"expected non-string field rejection, got: {err}"
2578+
);
2579+
}
2580+
25102581
#[test]
25112582
fn rejects_filter_on_unfilterable_field() {
25122583
let ont = ontology_with_unfilterable_field();

0 commit comments

Comments
 (0)