Skip to content

Commit 3df8175

Browse files
authored
fix: Avoid duplicate columns in shared variables (#128)
This PR adds support for reusing variables across multiple patterns in a Cypher query without introducing duplicate column errors. It implements logic in the DataFusion planner to detect when a target variable in a pattern is already bound in the schema, and applies a filter constraint (e.g., `prev.id = next.id`) instead of performing a redundant join.
1 parent 8eced81 commit 3df8175

3 files changed

Lines changed: 367 additions & 46 deletions

File tree

crates/lance-graph/src/datafusion_planner/join_ops.rs

Lines changed: 118 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use crate::config::{NodeMapping, RelationshipMapping};
1616
use crate::error::Result;
1717
use crate::source_catalog::GraphSourceCatalog;
1818
use datafusion::logical_expr::{
19-
col, BinaryExpr, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, Operator,
19+
col, BinaryExpr, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, TableSource,
2020
};
2121
use std::collections::HashMap;
2222
use std::sync::Arc;
@@ -41,6 +41,97 @@ pub(crate) struct TargetJoinParams<'a> {
4141
}
4242

4343
impl DataFusionPlanner {
44+
/// Build a target node scan with property filters and qualified column names
45+
fn build_target_scan(
46+
&self,
47+
target_source: Arc<dyn TableSource>,
48+
target_label: &str,
49+
target_variable: &str,
50+
target_properties: &HashMap<String, PropertyValue>,
51+
) -> Result<LogicalPlan> {
52+
let target_schema = target_source.schema();
53+
let normalized_target_label = target_label.to_lowercase();
54+
let mut target_builder =
55+
LogicalPlanBuilder::scan(&normalized_target_label, target_source, None).map_err(
56+
|e| self.plan_error(&format!("Failed to scan target node '{}'", target_label), e),
57+
)?;
58+
59+
// Apply target property filters (e.g., (b {age: 30}))
60+
for (k, v) in target_properties.iter() {
61+
let lit_expr = super::expression::to_df_value_expr(
62+
&crate::ast::ValueExpression::Literal(v.clone()),
63+
);
64+
let filter_expr = Expr::BinaryExpr(BinaryExpr {
65+
left: Box::new(col(k.to_lowercase())),
66+
op: Operator::Eq,
67+
right: Box::new(lit_expr),
68+
});
69+
target_builder = target_builder.filter(filter_expr).map_err(|e| {
70+
self.plan_error(&format!("Failed to apply target filter on '{}'", k), e)
71+
})?;
72+
}
73+
74+
let target_qualified_exprs: Vec<Expr> = target_schema
75+
.fields()
76+
.iter()
77+
.map(|field| {
78+
let qualified_name = qualify_column(target_variable, field.name());
79+
col(field.name()).alias(&qualified_name)
80+
})
81+
.collect();
82+
83+
target_builder
84+
.project(target_qualified_exprs)
85+
.map_err(|e| self.plan_error("Failed to project target columns", e))?
86+
.build()
87+
.map_err(|e| self.plan_error("Failed to build target scan", e))
88+
}
89+
90+
/// Handle variable reuse case by adding a filter constraint instead of joining
91+
fn handle_variable_reuse_filter(
92+
&self,
93+
mut builder: LogicalPlanBuilder,
94+
params: &TargetJoinParams,
95+
qualified_target_key: String,
96+
) -> Result<LogicalPlan> {
97+
// Determine the relationship target key based on direction
98+
let target_key = match params.direction {
99+
RelationshipDirection::Outgoing => &params.rel_map.target_id_field,
100+
RelationshipDirection::Incoming => &params.rel_map.source_id_field,
101+
RelationshipDirection::Undirected => &params.rel_map.target_id_field,
102+
};
103+
104+
let qualified_rel_target_key = qualify_column(params.rel_qualifier, target_key);
105+
106+
// Create a filter expression: rel_target_key = target_key
107+
let join_condition = Expr::BinaryExpr(BinaryExpr {
108+
left: Box::new(col(&qualified_rel_target_key)),
109+
op: Operator::Eq,
110+
right: Box::new(col(&qualified_target_key)),
111+
});
112+
113+
// Apply filter instead of join
114+
builder = builder
115+
.filter(join_condition)
116+
.map_err(|e| self.plan_error("Failed to apply variable reuse filter", e))?;
117+
118+
// TODO: Handle target_properties filters when variable is reused
119+
// For now, we'll skip them since the variable already exists
120+
if !params.target_properties.is_empty() {
121+
return Err(crate::error::GraphError::PlanError {
122+
message: format!(
123+
"Property filters on reused variable '{}' are not yet supported",
124+
params.target_variable
125+
),
126+
location: snafu::Location::new(file!(), line!(), column!()),
127+
});
128+
}
129+
130+
builder
131+
.build()
132+
.map_err(|e| self.plan_error("Failed to build plan with variable reuse", e))
133+
}
134+
44135
/// Join source node plan with relationship scan
45136
pub(crate) fn join_source_to_relationship(
46137
&self,
@@ -94,43 +185,35 @@ impl DataFusionPlanner {
94185
.map_err(|e| self.plan_error("Failed to build plan (no target source)", e));
95186
};
96187

97-
// Create target node scan with qualified column aliases and property filters
98-
let target_schema = target_source.schema();
99-
let normalized_target_label = target_label.to_lowercase();
100-
let mut target_builder =
101-
LogicalPlanBuilder::scan(&normalized_target_label, target_source, None).map_err(
102-
|e| self.plan_error(&format!("Failed to scan target node '{}'", target_label), e),
103-
)?;
188+
// Check if target variable already exists in the schema (variable reuse)
189+
// Build a temporary plan to inspect the current schema
190+
let current_plan = builder
191+
.clone()
192+
.build()
193+
.map_err(|e| self.plan_error("Failed to build temp plan for schema check", e))?;
194+
let current_schema = current_plan.schema();
104195

105-
// Apply target property filters (e.g., (b {age: 30}))
106-
for (k, v) in params.target_properties.iter() {
107-
let lit_expr = super::expression::to_df_value_expr(
108-
&crate::ast::ValueExpression::Literal(v.clone()),
109-
);
110-
let filter_expr = Expr::BinaryExpr(BinaryExpr {
111-
left: Box::new(col(k.to_lowercase())),
112-
op: Operator::Eq,
113-
right: Box::new(lit_expr),
114-
});
115-
target_builder = target_builder.filter(filter_expr).map_err(|e| {
116-
self.plan_error(&format!("Failed to apply target filter on '{}'", k), e)
117-
})?;
196+
// Check if the target variable's ID column already exists
197+
let qualified_target_key =
198+
qualify_column(params.target_variable, &params.node_map.id_field);
199+
let target_exists = current_schema
200+
.field_with_unqualified_name(&qualified_target_key)
201+
.is_ok();
202+
203+
if target_exists {
204+
// Variable reuse: target node columns already in schema
205+
// Skip creating new scan, just add filter constraint
206+
return self.handle_variable_reuse_filter(builder, params, qualified_target_key);
118207
}
119208

120-
let target_qualified_exprs: Vec<Expr> = target_schema
121-
.fields()
122-
.iter()
123-
.map(|field| {
124-
let qualified_name = qualify_column(params.target_variable, field.name());
125-
col(field.name()).alias(&qualified_name)
126-
})
127-
.collect();
128-
129-
let target_scan = target_builder
130-
.project(target_qualified_exprs)
131-
.map_err(|e| self.plan_error("Failed to project target columns", e))?
132-
.build()
133-
.map_err(|e| self.plan_error("Failed to build target scan", e))?;
209+
// Normal case: target variable doesn't exist yet
210+
// Create target node scan with qualified column aliases and property filters
211+
let target_scan = self.build_target_scan(
212+
target_source,
213+
&target_label,
214+
params.target_variable,
215+
params.target_properties,
216+
)?;
134217

135218
// Determine target join keys
136219
let target_key = match params.direction {
@@ -140,8 +223,6 @@ impl DataFusionPlanner {
140223
};
141224

142225
let qualified_rel_target_key = qualify_column(params.rel_qualifier, target_key);
143-
let qualified_target_key =
144-
qualify_column(params.target_variable, &params.node_map.id_field);
145226

146227
builder = builder
147228
.join(

crates/lance-graph/src/logical_plan.rs

Lines changed: 99 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -338,13 +338,18 @@ impl LogicalPlanner {
338338
.clone()
339339
.unwrap_or_else(|| format!("_node_{}", self.variables.len()));
340340

341-
let label = node
342-
.labels
343-
.first()
341+
// Validate label consistency if variable already exists
342+
self.validate_variable_label(&variable, &node.labels)?;
343+
344+
// Reuse existing label or derive from AST (default to "Node")
345+
let label = self
346+
.variables
347+
.get(&variable)
344348
.cloned()
349+
.or_else(|| node.labels.first().cloned())
345350
.unwrap_or_else(|| "Node".to_string());
346351

347-
// Register variable
352+
// Register variable with its label
348353
self.variables.insert(variable.clone(), label.clone());
349354

350355
Ok(LogicalOperator::ScanByLabel {
@@ -354,7 +359,25 @@ impl LogicalPlanner {
354359
})
355360
}
356361

357-
// (removed) plan_path_segment is superseded by plan_path
362+
/// Validate that a node's explicit label (if any) matches the already-registered
363+
/// label for this variable. Returns `Ok(())` if the variable is new or if labels
364+
/// are consistent, and an error if they conflict.
365+
fn validate_variable_label(&self, variable: &str, ast_labels: &[String]) -> Result<()> {
366+
if let Some(existing_label) = self.variables.get(variable) {
367+
if let Some(ast_label) = ast_labels.first() {
368+
if ast_label != existing_label {
369+
return Err(GraphError::PlanError {
370+
message: format!(
371+
"Variable '{}' already has label '{}', cannot redefine as '{}'",
372+
variable, existing_label, ast_label
373+
),
374+
location: snafu::Location::new(file!(), line!(), column!()),
375+
});
376+
}
377+
}
378+
}
379+
Ok(())
380+
}
358381

359382
/// Plan a full path pattern, respecting the starting variable if provided
360383
fn plan_path(
@@ -375,6 +398,11 @@ impl LogicalPlanner {
375398
None => self.extract_variable_from_plan(&plan)?,
376399
};
377400

401+
// Validate start node label consistency if variable already exists
402+
if let Some(start_var) = &path.start_node.variable {
403+
self.validate_variable_label(start_var, &path.start_node.labels)?;
404+
}
405+
378406
// For each segment, add an expand
379407
for segment in &path.segments {
380408
// Determine / register target variable
@@ -384,12 +412,17 @@ impl LogicalPlanner {
384412
.clone()
385413
.unwrap_or_else(|| format!("_node_{}", self.variables.len()));
386414

387-
let target_label = segment
388-
.end_node
389-
.labels
390-
.first()
415+
// Validate label consistency for already-registered variables
416+
self.validate_variable_label(&target_variable, &segment.end_node.labels)?;
417+
418+
// Reuse existing label or derive from AST (default to "Node")
419+
let target_label = self
420+
.variables
421+
.get(&target_variable)
391422
.cloned()
423+
.or_else(|| segment.end_node.labels.first().cloned())
392424
.unwrap_or_else(|| "Node".to_string());
425+
393426
self.variables
394427
.insert(target_variable.clone(), target_label.clone());
395428

@@ -1171,4 +1204,61 @@ mod tests {
11711204
_ => panic!("Expected Project at top level"),
11721205
}
11731206
}
1207+
1208+
#[test]
1209+
fn test_variable_reuse_across_patterns() {
1210+
let query_text =
1211+
"MATCH (a:Person)-[:KNOWS]->(shared:Person), (shared)-[:KNOWS]->(b:Person) RETURN b.name";
1212+
1213+
let ast = parse_cypher_query(query_text).unwrap();
1214+
let mut planner = LogicalPlanner::new();
1215+
let logical_plan = planner.plan(&ast).unwrap();
1216+
1217+
// Expect: Project { Expand(shared->b) { Expand(a->shared) { Scan(a) } } }
1218+
match &logical_plan {
1219+
LogicalOperator::Project { input, .. } => match input.as_ref() {
1220+
LogicalOperator::Expand {
1221+
input: inner,
1222+
source_variable,
1223+
target_variable,
1224+
..
1225+
} => {
1226+
assert_eq!(source_variable, "shared");
1227+
assert_eq!(target_variable, "b");
1228+
1229+
match inner.as_ref() {
1230+
LogicalOperator::Expand {
1231+
source_variable: first_src,
1232+
target_variable: first_dst,
1233+
..
1234+
} => {
1235+
assert_eq!(first_src, "a");
1236+
assert_eq!(first_dst, "shared");
1237+
}
1238+
_ => panic!("Expected first Expand (a->shared)"),
1239+
}
1240+
}
1241+
_ => panic!("Expected second Expand (shared->b)"),
1242+
},
1243+
_ => panic!("Expected Project at top level"),
1244+
}
1245+
}
1246+
1247+
#[test]
1248+
fn test_variable_reuse_with_conflicting_labels() {
1249+
let query_text =
1250+
"MATCH (a:Person)-[:KNOWS]->(shared:Person), (shared:Company)-[:EMPLOYS]->(b:Person) RETURN b.name";
1251+
1252+
let ast = parse_cypher_query(query_text).unwrap();
1253+
let mut planner = LogicalPlanner::new();
1254+
let err = planner.plan(&ast).unwrap_err();
1255+
let err_msg = err.to_string();
1256+
1257+
assert!(
1258+
err_msg.contains("already has label 'Person'")
1259+
&& err_msg.contains("cannot redefine as 'Company'"),
1260+
"Expected error about label conflict, got: {}",
1261+
err_msg
1262+
);
1263+
}
11741264
}

0 commit comments

Comments
 (0)