diff --git a/rust/lance-graph/src/ast.rs b/rust/lance-graph/src/ast.rs
index fa1c8cec..e242ea5e 100644
--- a/rust/lance-graph/src/ast.rs
+++ b/rust/lance-graph/src/ast.rs
@@ -223,6 +223,21 @@ pub enum BooleanExpression {
         expression: ValueExpression,
         pattern: String,
     },
+    /// `expr CONTAINS 'substring'`: substring matching
+    Contains {
+        expression: ValueExpression,
+        substring: String,
+    },
+    /// `expr STARTS WITH 'prefix'`: prefix matching
+    StartsWith {
+        expression: ValueExpression,
+        prefix: String,
+    },
+    /// `expr ENDS WITH 'suffix'`: suffix matching
+    EndsWith {
+        expression: ValueExpression,
+        suffix: String,
+    },
     /// IS NULL pattern matching
     IsNull(ValueExpression),
     /// IS NOT NULL pattern matching
diff --git a/rust/lance-graph/src/datafusion_planner/expression.rs b/rust/lance-graph/src/datafusion_planner/expression.rs
index a804e0da..8d58e7d3 100644
--- a/rust/lance-graph/src/datafusion_planner/expression.rs
+++ b/rust/lance-graph/src/datafusion_planner/expression.rs
@@ -13,6 +13,38 @@ use datafusion_functions_aggregate::min_max::max;
 use datafusion_functions_aggregate::min_max::min;
 use datafusion_functions_aggregate::sum::sum;
 
+/// Build a non-negated LIKE predicate over `expression`.
+///
+/// `pattern` is used verbatim as the LIKE pattern (`%` and `_` are wildcards).
+/// No explicit escape character is set; `case_insensitive` is passed through.
+fn create_like_expr(expression: &ValueExpression, pattern: &str, case_insensitive: bool) -> Expr {
+    Expr::Like(datafusion::logical_expr::Like {
+        negated: false,
+        expr: Box::new(to_df_value_expr(expression)),
+        pattern: Box::new(lit(pattern.to_string())),
+        escape_char: None,
+        case_insensitive,
+    })
+}
+
+/// Escape LIKE metacharacters so that `text` is matched literally.
+///
+/// CONTAINS / STARTS WITH / ENDS WITH take plain strings, so a `%`, `_` or `\`
+/// in the operand must not act as a wildcard or escape. Each is prefixed with
+/// a backslash.
+// NOTE(review): relies on `\` being the implicit LIKE escape when
+// `escape_char` is `None` -- confirm against the DataFusion version in use.
+fn escape_like_literal(text: &str) -> String {
+    let mut escaped = String::with_capacity(text.len());
+    for ch in text.chars() {
+        if matches!(ch, '\\' | '%' | '_') {
+            escaped.push('\\');
+        }
+        escaped.push(ch);
+    }
+    escaped
+}
+
 /// Convert BooleanExpression to DataFusion Expr
 pub(crate) fn to_df_boolean_expr(expr: &BooleanExpression) -> Expr {
     use crate::ast::{BooleanExpression as BE, ComparisonOperator as CO};
@@ -63,13 +95,25 @@ pub(crate) fn to_df_boolean_expr(expr: &BooleanExpression) -> Expr {
         BE::Like {
             expression,
             pattern,
-        } => Expr::Like(datafusion::logical_expr::Like {
-            negated: false,
-            expr: Box::new(to_df_value_expr(expression)),
-            pattern: Box::new(lit(pattern.clone())),
-            escape_char: None,
-            case_insensitive: false,
-        }),
+        } => create_like_expr(expression, pattern, false),
+        BE::Contains {
+            expression,
+            substring,
+        } => {
+            // CONTAINS is LIKE '%substring%' with the operand matched literally
+            let pattern = format!("%{}%", escape_like_literal(substring));
+            create_like_expr(expression, &pattern, false)
+        }
+        BE::StartsWith { expression, prefix } => {
+            // STARTS WITH is LIKE 'prefix%' with the operand matched literally
+            let pattern = format!("{}%", escape_like_literal(prefix));
+            create_like_expr(expression, &pattern, false)
+        }
+        BE::EndsWith { expression, suffix } => {
+            // ENDS WITH is LIKE '%suffix' with the operand matched literally
+            let pattern = format!("%{}", escape_like_literal(suffix));
+            create_like_expr(expression, &pattern, false)
+        }
     }
 }
 
@@ -472,6 +516,173 @@ mod tests {
         assert!(s.contains("p__email"), "Should contain column reference");
     }
 
+    #[test]
+    fn test_boolean_expr_contains() {
+        let expr = BooleanExpression::Contains {
+            expression: ValueExpression::Property(PropertyRef {
+                variable: "p".into(),
+                property: "name".into(),
+            }),
+            substring: "ali".into(),
+        };
+
+        if let Expr::Like(like_expr) = to_df_boolean_expr(&expr) {
+            assert!(!like_expr.negated, "Should not be negated");
+            assert!(!like_expr.case_insensitive, "Should be case sensitive");
+            assert_eq!(like_expr.escape_char, None, "Should have no escape char");
+
+            // Check the expression is the column
+            match *like_expr.expr {
+                Expr::Column(ref col_expr) => {
+                    assert_eq!(col_expr.name(), "p__name");
+                }
+                other => panic!("Expected column expression, got {:?}", other),
+            }
+
+            // Check pattern is '%ali%'
+            match *like_expr.pattern {
+                Expr::Literal(ref scalar, _) => {
+                    let s = format!("{:?}", scalar);
+                    assert!(s.contains("%ali%"), "Pattern should be '%ali%', got: {}", s);
+                }
+                other => panic!("Expected literal pattern, got {:?}", other),
+            }
+        } else {
+            panic!("Expected Like expression");
+        }
+    }
+
+    #[test]
+    fn test_boolean_expr_starts_with() {
+        let expr = BooleanExpression::StartsWith {
+            expression: ValueExpression::Property(PropertyRef {
+                variable: "p".into(),
+                property: "email".into(),
+            }),
+            prefix: "admin".into(),
+        };
+
+        if let Expr::Like(like_expr) = to_df_boolean_expr(&expr) {
+            assert!(!like_expr.negated, "Should not be negated");
+            assert!(!like_expr.case_insensitive, "Should be case sensitive");
+
+            // Check the expression is the column
+            match *like_expr.expr {
+                Expr::Column(ref col_expr) => {
+                    assert_eq!(col_expr.name(), "p__email");
+                }
+                other => panic!("Expected column expression, got {:?}", other),
+            }
+
+            // Check pattern is 'admin%'
+            match *like_expr.pattern {
+                Expr::Literal(ref scalar, _) => {
+                    let s = format!("{:?}", scalar);
+                    assert!(
+                        s.contains("admin%"),
+                        "Pattern should be 'admin%', got: {}",
+                        s
+                    );
+                }
+                other => panic!("Expected literal pattern, got {:?}", other),
+            }
+        } else {
+            panic!("Expected Like expression");
+        }
+    }
+
+    #[test]
+    fn test_boolean_expr_ends_with() {
+        let expr = BooleanExpression::EndsWith {
+            expression: ValueExpression::Property(PropertyRef {
+                variable: "p".into(),
+                property: "email".into(),
+            }),
+            suffix: "@example.com".into(),
+        };
+
+        if let Expr::Like(like_expr) = to_df_boolean_expr(&expr) {
+            assert!(!like_expr.negated, "Should not be negated");
+            assert!(!like_expr.case_insensitive, "Should be case sensitive");
+
+            // Check the expression is the column
+            match *like_expr.expr {
+                Expr::Column(ref col_expr) => {
+                    assert_eq!(col_expr.name(), "p__email");
+                }
+                other => panic!("Expected column expression, got {:?}", other),
+            }
+
+            // Check pattern is '%@example.com'
+            match *like_expr.pattern {
+                Expr::Literal(ref scalar, _) => {
+                    let s = format!("{:?}", scalar);
+                    assert!(
+                        s.contains("%@example.com"),
+                        "Pattern should be '%@example.com', got: {}",
+                        s
+                    );
+                }
+                other => panic!("Expected literal pattern, got {:?}", other),
+            }
+        } else {
+            panic!("Expected Like expression");
+        }
+    }
+
+    #[test]
+    fn test_boolean_expr_contains_case_sensitivity() {
+        // Test that CONTAINS is case-sensitive (case_insensitive = false)
+        let expr = BooleanExpression::Contains {
+            expression: ValueExpression::Property(PropertyRef {
+                variable: "p".into(),
+                property: "name".into(),
+            }),
+            substring: "Test".into(),
+        };
+
+        if let Expr::Like(like_expr) = to_df_boolean_expr(&expr) {
+            assert!(
+                !like_expr.case_insensitive,
+                "CONTAINS should be case-sensitive by default"
+            );
+        } else {
+            panic!("Expected Like expression");
+        }
+    }
+
+    #[test]
+    fn test_boolean_expr_string_operators_with_variable() {
+        // Test that string operators work with variable references, not just properties
+        let expr = BooleanExpression::Contains {
+            expression: ValueExpression::Variable("name".into()),
+            substring: "test".into(),
+        };
+
+        if let Expr::Like(like_expr) = to_df_boolean_expr(&expr) {
+            match *like_expr.expr {
+                Expr::Column(ref col_expr) => {
+                    assert_eq!(
+                        col_expr.name(),
+                        "name",
+                        "Should reference variable directly"
+                    );
+                }
+                other => panic!("Expected column expression, got {:?}", other),
+            }
+        } else {
+            panic!("Expected Like expression");
+        }
+    }
+
+    #[test]
+    fn test_escape_like_literal() {
+        // Wildcards and the escape character itself are backslash-escaped
+        assert_eq!(super::escape_like_literal("50%_a\\b"), "50\\%\\_a\\\\b");
+        // Plain text is returned unchanged
+        assert_eq!(super::escape_like_literal("ali"), "ali");
+    }
+
     // ========================================================================
     // Unit tests for to_df_value_expr()
     // ========================================================================
diff --git a/rust/lance-graph/src/parser.rs b/rust/lance-graph/src/parser.rs
index b7f62d51..d3044b86 100644
--- a/rust/lance-graph/src/parser.rs
+++ b/rust/lance-graph/src/parser.rs
@@ -341,6 +341,52 @@ fn comparison_expression(input: &str) -> IResult<&str, BooleanExpression> {
             },
         ));
     }
+    // Match CONTAINS substring
+    if let Ok((input_after_contains, (_, _, substring))) =
+        tuple((tag_no_case("CONTAINS"), multispace0, string_literal))(input)
+    {
+        return Ok((
+            input_after_contains,
+            BooleanExpression::Contains {
+                expression: left,
+                substring,
+            },
+        ));
+    }
+    // Match STARTS WITH prefix (note: multi-word operator)
+    if let Ok((input_after_starts, (_, _, _, _, prefix))) = tuple((
+        tag_no_case("STARTS"),
+        multispace1,
+        tag_no_case("WITH"),
+        multispace0,
+        string_literal,
+    ))(input)
+    {
+        return Ok((
+            input_after_starts,
+            BooleanExpression::StartsWith {
+                expression: left,
+                prefix,
+            },
+        ));
+    }
+    // Match ENDS WITH suffix (note: multi-word operator)
+    if let Ok((input_after_ends, (_, _, _, _, suffix))) = tuple((
+        tag_no_case("ENDS"),
+        multispace1,
+        tag_no_case("WITH"),
+        multispace0,
+        string_literal,
+    ))(input)
+    {
+        return Ok((
+            input_after_ends,
+            BooleanExpression::EndsWith {
+                expression: left,
+                suffix,
+            },
+        ));
+    }
     // Match is null
     if let Ok((rest, ())) = is_null_comparison(input) {
         return Ok((rest, BooleanExpression::IsNull(left_clone)));
@@ -1102,4 +1148,121 @@ mod tests {
             _ => panic!("Expected AND expression"),
         }
     }
+
+    #[test]
+    fn test_parse_contains() {
+        let query = "MATCH (n:Person) WHERE n.name CONTAINS 'Jo' RETURN n.name";
+        let result = parse_cypher_query(query);
+        assert!(result.is_ok());
+
+        let query = result.unwrap();
+        assert!(query.where_clause.is_some());
+
+        match &query.where_clause.unwrap().expression {
+            BooleanExpression::Contains {
+                expression,
+                substring,
+            } => {
+                assert_eq!(substring, "Jo");
+                match expression {
+                    ValueExpression::Property(prop) => {
+                        assert_eq!(prop.variable, "n");
+                        assert_eq!(prop.property, "name");
+                    }
+                    _ => panic!("Expected property reference"),
+                }
+            }
+            _ => panic!("Expected CONTAINS expression"),
+        }
+    }
+
+    #[test]
+    fn test_parse_starts_with() {
+        let query = "MATCH (n:Person) WHERE n.name STARTS WITH 'Alice' RETURN n.name";
+        let result = parse_cypher_query(query);
+        assert!(result.is_ok());
+
+        let query = result.unwrap();
+        assert!(query.where_clause.is_some());
+
+        match &query.where_clause.unwrap().expression {
+            BooleanExpression::StartsWith { expression, prefix } => {
+                assert_eq!(prefix, "Alice");
+                match expression {
+                    ValueExpression::Property(prop) => {
+                        assert_eq!(prop.variable, "n");
+                        assert_eq!(prop.property, "name");
+                    }
+                    _ => panic!("Expected property reference"),
+                }
+            }
+            _ => panic!("Expected STARTS WITH expression"),
+        }
+    }
+
+    #[test]
+    fn test_parse_ends_with() {
+        let query = "MATCH (n:Person) WHERE n.email ENDS WITH '@example.com' RETURN n.email";
+        let result = parse_cypher_query(query);
+        assert!(result.is_ok());
+
+        let query = result.unwrap();
+        assert!(query.where_clause.is_some());
+
+        match &query.where_clause.unwrap().expression {
+            BooleanExpression::EndsWith { expression, suffix } => {
+                assert_eq!(suffix, "@example.com");
+                match expression {
+                    ValueExpression::Property(prop) => {
+                        assert_eq!(prop.variable, "n");
+                        assert_eq!(prop.property, "email");
+                    }
+                    _ => panic!("Expected property reference"),
+                }
+            }
+            _ => panic!("Expected ENDS WITH expression"),
+        }
+    }
+
+    #[test]
+    fn test_parse_contains_case_insensitive_keyword() {
+        let query = "MATCH (n:Person) WHERE n.name contains 'test' RETURN n.name";
+        let result = parse_cypher_query(query);
+        assert!(result.is_ok());
+
+        match &result.unwrap().where_clause.unwrap().expression {
+            BooleanExpression::Contains { substring, .. } => {
+                assert_eq!(substring, "test");
+            }
+            _ => panic!("Expected CONTAINS expression"),
+        }
+    }
+
+    #[test]
+    fn test_parse_string_operators_in_complex_where() {
+        let query =
+            "MATCH (n:Person) WHERE n.name CONTAINS 'Jo' AND n.email ENDS WITH '.com' RETURN n";
+        let result = parse_cypher_query(query);
+        assert!(result.is_ok());
+
+        match &result.unwrap().where_clause.unwrap().expression {
+            BooleanExpression::And(left, right) => {
+                // Left should be CONTAINS
+                match **left {
+                    BooleanExpression::Contains { ref substring, .. } => {
+                        assert_eq!(substring, "Jo");
+                    }
+                    _ => panic!("Expected CONTAINS expression on left"),
+                }
+                // Right should be ENDS WITH
+                match **right {
+                    BooleanExpression::EndsWith { ref suffix, .. } => {
+                        assert_eq!(suffix, ".com");
+                    }
+                    _ => panic!("Expected ENDS WITH expression on right"),
+                }
+            }
+            _ => panic!("Expected AND expression"),
+        }
+    }
 }
diff --git a/rust/lance-graph/src/semantic.rs b/rust/lance-graph/src/semantic.rs
index 80c592fd..bbf1731f 100644
--- a/rust/lance-graph/src/semantic.rs
+++ b/rust/lance-graph/src/semantic.rs
@@ -247,6 +247,15 @@ impl SemanticAnalyzer {
             BooleanExpression::Like { expression, .. } => {
                 self.analyze_value_expression(expression)?;
             }
+            BooleanExpression::Contains { expression, .. } => {
+                self.analyze_value_expression(expression)?;
+            }
+            BooleanExpression::StartsWith { expression, .. } => {
+                self.analyze_value_expression(expression)?;
+            }
+            BooleanExpression::EndsWith { expression, .. } => {
+                self.analyze_value_expression(expression)?;
+            }
             BooleanExpression::IsNull(expression) => {
                 self.analyze_value_expression(expression)?;
             }
diff --git a/rust/lance-graph/tests/test_datafusion_pipeline.rs b/rust/lance-graph/tests/test_datafusion_pipeline.rs
index d67bdc04..8ee7451f 100644
--- a/rust/lance-graph/tests/test_datafusion_pipeline.rs
+++ b/rust/lance-graph/tests/test_datafusion_pipeline.rs
@@ -3778,7 +3778,7 @@ async fn test_datafusion_is_not_null_relationship_property() {
 }
 
 // ============================================================================
-// LIKE Pattern Matching Tests
+// String Operator Tests
 // ============================================================================
 
 #[tokio::test]
@@ -3907,3 +3907,211 @@ async fn test_datafusion_like_case_sensitive() {
     // Should not match 'Alice' (lowercase 'a' vs uppercase 'A')
     assert_eq!(result.num_rows(), 0);
 }
+
+#[tokio::test]
+async fn test_datafusion_contains_basic() {
+    let config = create_graph_config();
+    let person_batch = create_person_dataset();
+
+    let query = CypherQuery::new("MATCH (p:Person) WHERE p.name CONTAINS 'li' RETURN p.name")
+        .unwrap()
+        .with_config(config);
+
+    let mut datasets = HashMap::new();
+    datasets.insert("Person".to_string(), person_batch);
+
+    let result = query
+        .execute(datasets, Some(ExecutionStrategy::DataFusion))
+        .await
+        .unwrap();
+
+    // Should match "Alice" and "Charlie" (contains "li")
+    assert_eq!(result.num_rows(), 2);
+
+    let names = result
+        .column(0)
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .unwrap();
+
+    let mut name_vec: Vec<String> = (0..result.num_rows())
+        .map(|i| names.value(i).to_string())
+        .collect();
+    name_vec.sort();
+
+    assert_eq!(name_vec, vec!["Alice", "Charlie"]);
+}
+
+#[tokio::test]
+async fn test_datafusion_starts_with_basic() {
+    let config = create_graph_config();
+    let person_batch = create_person_dataset();
+
+    let query = CypherQuery::new("MATCH (p:Person) WHERE p.name STARTS WITH 'A' RETURN p.name")
+        .unwrap()
+        .with_config(config);
+
+    let mut datasets = HashMap::new();
+    datasets.insert("Person".to_string(), person_batch);
+
+    let result = query
+        .execute(datasets, Some(ExecutionStrategy::DataFusion))
+        .await
+        .unwrap();
+
+    // Should match only "Alice"
+    assert_eq!(result.num_rows(), 1);
+
+    let names = result
+        .column(0)
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .unwrap();
+
+    assert_eq!(names.value(0), "Alice");
+}
+
+#[tokio::test]
+async fn test_datafusion_ends_with_basic() {
+    let config = create_graph_config();
+    let person_batch = create_person_dataset();
+
+    let query = CypherQuery::new("MATCH (p:Person) WHERE p.name ENDS WITH 'e' RETURN p.name")
+        .unwrap()
+        .with_config(config);
+
+    let mut datasets = HashMap::new();
+    datasets.insert("Person".to_string(), person_batch);
+
+    let result = query
+        .execute(datasets, Some(ExecutionStrategy::DataFusion))
+        .await
+        .unwrap();
+
+    // Should match "Alice", "Charlie", and "Eve" (ends with "e")
+    assert_eq!(result.num_rows(), 3);
+
+    let names = result
+        .column(0)
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .unwrap();
+
+    let mut name_vec: Vec<String> = (0..result.num_rows())
+        .map(|i| names.value(i).to_string())
+        .collect();
+    name_vec.sort();
+
+    assert_eq!(name_vec, vec!["Alice", "Charlie", "Eve"]);
+}
+
+#[tokio::test]
+async fn test_datafusion_contains_case_sensitive() {
+    let config = create_graph_config();
+    let person_batch = create_person_dataset();
+
+    let query = CypherQuery::new("MATCH (p:Person) WHERE p.name CONTAINS 'A' RETURN p.name")
+        .unwrap()
+        .with_config(config);
+
+    let mut datasets = HashMap::new();
+    datasets.insert("Person".to_string(), person_batch);
+
+    let result = query
+        .execute(datasets, Some(ExecutionStrategy::DataFusion))
+        .await
+        .unwrap();
+
+    // Should match only "Alice" (capital A)
+    // No other names contain capital 'A'
+    assert_eq!(result.num_rows(), 1);
+
+    let names = result
+        .column(0)
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .unwrap();
+
+    assert_eq!(names.value(0), "Alice");
+}
+
+#[tokio::test]
+async fn test_datafusion_string_operators_combined() {
+    let config = create_graph_config();
+    let person_batch = create_person_dataset();
+
+    let query = CypherQuery::new(
+        "MATCH (p:Person) WHERE p.name STARTS WITH 'C' AND p.name ENDS WITH 'e' RETURN p.name",
+    )
+    .unwrap()
+    .with_config(config);
+
+    let mut datasets = HashMap::new();
+    datasets.insert("Person".to_string(), person_batch);
+
+    let result = query
+        .execute(datasets, Some(ExecutionStrategy::DataFusion))
+        .await
+        .unwrap();
+
+    // Should match only "Charlie" (starts with 'C' and ends with 'e')
+    assert_eq!(result.num_rows(), 1);
+
+    let names = result
+        .column(0)
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .unwrap();
+
+    assert_eq!(names.value(0), "Charlie");
+}
+
+#[tokio::test]
+async fn test_datafusion_contains_in_relationship_query() {
+    let config = create_graph_config();
+    let person_batch = create_person_dataset();
+    let knows_batch = create_knows_dataset();
+
+    let query = CypherQuery::new("MATCH (a:Person)-[:KNOWS]->(b:Person) WHERE a.name CONTAINS 'li' AND b.age > 30 RETURN a.name, b.name")
+        .unwrap()
+        .with_config(config);
+
+    let mut datasets = HashMap::new();
+    datasets.insert("Person".to_string(), person_batch);
+    datasets.insert("KNOWS".to_string(), knows_batch);
+
+    let result = query
+        .execute(datasets, Some(ExecutionStrategy::DataFusion))
+        .await
+        .unwrap();
+
+    // Alice knows Bob (35) and Charlie (30 - not > 30)
+    // Charlie knows David (40)
+    // So we should get: Alice->Bob, Charlie->David
+    assert_eq!(result.num_rows(), 2);
+
+    let a_names = result
+        .column(0)
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .unwrap();
+
+    let b_names = result
+        .column(1)
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .unwrap();
+
+    let mut pairs: Vec<(String, String)> = (0..result.num_rows())
+        .map(|i| (a_names.value(i).to_string(), b_names.value(i).to_string()))
+        .collect();
+    pairs.sort();
+
+    assert_eq!(
+        pairs,
+        vec![
+            ("Alice".to_string(), "Bob".to_string()),
+            ("Charlie".to_string(), "David".to_string())
+        ]
+    );
+}