Skip to content

Commit d59cdfe

Browse files
authored
Fix name tracker (#19856)
## Which issue does this PR close? - Closes #17508 ## Rationale for this change The previous implementation used UUID-based aliasing as a workaround to prevent duplicate names for literals in Substrait plans. This approach had several drawbacks: - Non-deterministic plan names that made testing difficult (requiring UUID regex filters) - Only addressed literal naming conflicts, not the broader issue of name deduplication - Added unnecessary dependency on the `uuid` crate - Didn't properly handle cases where the same qualified name could appear with different schema representations ## What changes are included in this PR? 1. Enhanced NameTracker: Refactored to detect two types of conflicts: - Duplicate schema names: Tracked via schema_name() to prevent validate_unique_names failures (e.g., two Utf8(NULL) literals) - Ambiguous references: Tracked via qualified_name() to prevent DFSchema::check_names failures when a qualified field (e.g., left.Utf8(NULL)) and unqualified field (e.g., Utf8(NULL)) share the same column name 2. **Removed UUID dependency**: Eliminated the `uuid` crate from `datafusion/substrait` 3. **Removed literal-specific aliasing**: The UUID-based workaround in `project_rel.rs` is no longer needed as the improved NameTracker handles all naming conflicts consistently 4. **Deterministic naming**: Name conflicts now use predictable `__temp__N` suffixes instead of random UUIDs Note: This doesn't fully fix all the issues in #17508 which allow some special casing of `CAST` which are not included here. ## Are these changes tested? Yes: - Updated snapshot tests to reflect the new deterministic naming (e.g., `Utf8("people")__temp__0` instead of UUID-based names) - Modified some roundtrip tests to verify semantic equivalence (schema matching and execution) rather than exact string matching, which is more robust - All existing integration tests pass with the new naming scheme ## Are there any user-facing changes? Minimal. The generated plan names are now deterministic and more readable (using `__temp__N` suffixes instead of UUIDs), but this is primarily an internal representation change. The functional behavior and query results remain unchanged.
1 parent b6d46a6 commit d59cdfe

7 files changed

Lines changed: 283 additions & 98 deletions

File tree

Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/substrait/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ prost = { workspace = true }
4747
substrait = { version = "0.62", features = ["serde"] }
4848
url = { workspace = true }
4949
tokio = { workspace = true, features = ["fs"] }
50-
uuid = { workspace = true, features = ["v4"] }
5150

5251
[dev-dependencies]
5352
datafusion = { workspace = true, features = ["nested_expressions", "unicode_expressions"] }

datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -62,20 +62,7 @@ pub async fn from_project_rel(
6262
// to transform it into a column reference
6363
window_exprs.insert(e.clone());
6464
}
65-
// Substrait plans are ordinal based, so they do not provide names for columns.
66-
// Names for columns are generated by Datafusion during conversion, and for literals
67-
// Datafusion produces names based on the literal value. It is possible to construct
68-
// valid Substrait plans that result in duplicated names if the same literal value is
69-
// used in multiple relations. To avoid this issue, we alias literals with unique names.
70-
// The name tracker will ensure that two literals in the same project would have
71-
// unique names but, it does not ensure that if a literal column exists in a previous
72-
// project say before a join that it is deduplicated with respect to those columns.
73-
// See: https://github.com/apache/datafusion/pull/17299
74-
let maybe_apply_alias = match e {
75-
lit @ Expr::Literal(_, _) => lit.alias(uuid::Uuid::new_v4().to_string()),
76-
_ => e,
77-
};
78-
explicit_exprs.push(name_tracker.get_uniquely_named_expr(maybe_apply_alias)?);
65+
explicit_exprs.push(name_tracker.get_uniquely_named_expr(e)?);
7966
}
8067

8168
let input = if !window_exprs.is_empty() {

datafusion/substrait/src/logical_plan/consumer/utils.rs

Lines changed: 198 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use datafusion::common::{
2323
};
2424
use datafusion::logical_expr::expr::Sort;
2525
use datafusion::logical_expr::{Cast, Expr, ExprSchemable};
26+
use datafusion::sql::TableReference;
2627
use std::collections::HashSet;
2728
use std::sync::Arc;
2829
use substrait::proto::SortField;
@@ -359,35 +360,71 @@ fn compatible_nullabilities(
359360
}
360361

361362
pub(super) struct NameTracker {
362-
seen_names: HashSet<String>,
363-
}
364-
365-
pub(super) enum NameTrackerStatus {
366-
NeverSeen,
367-
SeenBefore,
363+
/// Tracks seen schema names (from expr.schema_name()).
364+
/// Used to detect duplicates that would fail validate_unique_names.
365+
seen_schema_names: HashSet<String>,
366+
/// Tracks column names that have been seen with a qualifier.
367+
/// Used to detect ambiguous references (qualified + unqualified with same name).
368+
qualified_names: HashSet<String>,
369+
/// Tracks column names that have been seen without a qualifier.
370+
/// Used to detect ambiguous references.
371+
unqualified_names: HashSet<String>,
368372
}
369373

370374
impl NameTracker {
371375
pub(super) fn new() -> Self {
372376
NameTracker {
373-
seen_names: HashSet::default(),
377+
seen_schema_names: HashSet::default(),
378+
qualified_names: HashSet::default(),
379+
unqualified_names: HashSet::default(),
374380
}
375381
}
376-
pub(super) fn get_unique_name(
377-
&mut self,
378-
name: String,
379-
) -> (String, NameTrackerStatus) {
380-
match self.seen_names.insert(name.clone()) {
381-
true => (name, NameTrackerStatus::NeverSeen),
382-
false => {
383-
let mut counter = 0;
384-
loop {
385-
let candidate_name = format!("{name}__temp__{counter}");
386-
if self.seen_names.insert(candidate_name.clone()) {
387-
return (candidate_name, NameTrackerStatus::SeenBefore);
388-
}
389-
counter += 1;
390-
}
382+
383+
/// Check if the expression would cause a conflict either in:
384+
/// 1. validate_unique_names (duplicate schema_name)
385+
/// 2. DFSchema::check_names (ambiguous reference)
386+
fn would_conflict(&self, expr: &Expr) -> bool {
387+
let (qualifier, name) = expr.qualified_name();
388+
let schema_name = expr.schema_name().to_string();
389+
self.would_conflict_inner((qualifier, &name), &schema_name)
390+
}
391+
392+
fn would_conflict_inner(
393+
&self,
394+
qualified_name: (Option<TableReference>, &str),
395+
schema_name: &str,
396+
) -> bool {
397+
// Check for duplicate schema_name (would fail validate_unique_names)
398+
if self.seen_schema_names.contains(schema_name) {
399+
return true;
400+
}
401+
402+
// Check for ambiguous reference (would fail DFSchema::check_names)
403+
// This happens when a qualified field and unqualified field have the same name
404+
let (qualifier, name) = qualified_name;
405+
match qualifier {
406+
Some(_) => {
407+
// Adding a qualified name - conflicts if unqualified version exists
408+
self.unqualified_names.contains(name)
409+
}
410+
None => {
411+
// Adding an unqualified name - conflicts if qualified version exists
412+
self.qualified_names.contains(name)
413+
}
414+
}
415+
}
416+
417+
fn insert(&mut self, expr: &Expr) {
418+
let schema_name = expr.schema_name().to_string();
419+
self.seen_schema_names.insert(schema_name);
420+
421+
let (qualifier, name) = expr.qualified_name();
422+
match qualifier {
423+
Some(_) => {
424+
self.qualified_names.insert(name);
425+
}
426+
None => {
427+
self.unqualified_names.insert(name);
391428
}
392429
}
393430
}
@@ -396,10 +433,25 @@ impl NameTracker {
396433
&mut self,
397434
expr: Expr,
398435
) -> datafusion::common::Result<Expr> {
399-
match self.get_unique_name(expr.name_for_alias()?) {
400-
(_, NameTrackerStatus::NeverSeen) => Ok(expr),
401-
(name, NameTrackerStatus::SeenBefore) => Ok(expr.alias(name)),
436+
if !self.would_conflict(&expr) {
437+
self.insert(&expr);
438+
return Ok(expr);
402439
}
440+
441+
// Name collision - need to generate a unique alias
442+
let schema_name = expr.schema_name().to_string();
443+
let mut counter = 0;
444+
let candidate_name = loop {
445+
let candidate_name = format!("{schema_name}__temp__{counter}");
446+
// .alias always produces an unqualified name so check for conflicts accordingly.
447+
if !self.would_conflict_inner((None, &candidate_name), &candidate_name) {
448+
break candidate_name;
449+
}
450+
counter += 1;
451+
};
452+
let candidate_expr = expr.alias(&candidate_name);
453+
self.insert(&candidate_expr);
454+
Ok(candidate_expr)
403455
}
404456
}
405457

@@ -469,13 +521,14 @@ pub(crate) fn from_substrait_precision(
469521

470522
#[cfg(test)]
471523
pub(crate) mod tests {
472-
use super::make_renamed_schema;
524+
use super::{NameTracker, make_renamed_schema};
473525
use crate::extensions::Extensions;
474526
use crate::logical_plan::consumer::DefaultSubstraitConsumer;
475527
use datafusion::arrow::datatypes::{DataType, Field};
476528
use datafusion::common::DFSchema;
477529
use datafusion::error::Result;
478530
use datafusion::execution::SessionState;
531+
use datafusion::logical_expr::{Expr, col};
479532
use datafusion::prelude::SessionContext;
480533
use datafusion::sql::TableReference;
481534
use std::collections::HashMap;
@@ -641,4 +694,123 @@ pub(crate) mod tests {
641694
);
642695
Ok(())
643696
}
697+
698+
#[test]
699+
fn name_tracker_unique_names_pass_through() -> Result<()> {
700+
let mut tracker = NameTracker::new();
701+
702+
// First expression should pass through unchanged
703+
let expr1 = col("a");
704+
let result1 = tracker.get_uniquely_named_expr(expr1.clone())?;
705+
assert_eq!(result1, col("a"));
706+
707+
// Different name should also pass through unchanged
708+
let expr2 = col("b");
709+
let result2 = tracker.get_uniquely_named_expr(expr2)?;
710+
assert_eq!(result2, col("b"));
711+
712+
Ok(())
713+
}
714+
715+
#[test]
716+
fn name_tracker_duplicate_schema_name_gets_alias() -> Result<()> {
717+
let mut tracker = NameTracker::new();
718+
719+
// First expression with name "a"
720+
let expr1 = col("a");
721+
let result1 = tracker.get_uniquely_named_expr(expr1)?;
722+
assert_eq!(result1, col("a"));
723+
724+
// Second expression with same name "a" should get aliased
725+
let expr2 = col("a");
726+
let result2 = tracker.get_uniquely_named_expr(expr2)?;
727+
assert_eq!(result2, col("a").alias("a__temp__0"));
728+
729+
// Third expression with same name "a" should get a different alias
730+
let expr3 = col("a");
731+
let result3 = tracker.get_uniquely_named_expr(expr3)?;
732+
assert_eq!(result3, col("a").alias("a__temp__1"));
733+
734+
Ok(())
735+
}
736+
737+
#[test]
738+
fn name_tracker_qualified_then_unqualified_conflicts() -> Result<()> {
739+
let mut tracker = NameTracker::new();
740+
741+
// First: qualified column "table.a"
742+
let qualified_col =
743+
Expr::Column(datafusion::common::Column::new(Some("table"), "a"));
744+
let result1 = tracker.get_uniquely_named_expr(qualified_col)?;
745+
assert_eq!(
746+
result1,
747+
Expr::Column(datafusion::common::Column::new(Some("table"), "a"))
748+
);
749+
750+
// Second: unqualified column "a" - should conflict (ambiguous reference)
751+
let unqualified_col = col("a");
752+
let result2 = tracker.get_uniquely_named_expr(unqualified_col)?;
753+
// Should be aliased to avoid ambiguous reference
754+
assert_eq!(result2, col("a").alias("a__temp__0"));
755+
756+
Ok(())
757+
}
758+
759+
#[test]
760+
fn name_tracker_unqualified_then_qualified_conflicts() -> Result<()> {
761+
let mut tracker = NameTracker::new();
762+
763+
// First: unqualified column "a"
764+
let unqualified_col = col("a");
765+
let result1 = tracker.get_uniquely_named_expr(unqualified_col)?;
766+
assert_eq!(result1, col("a"));
767+
768+
// Second: qualified column "table.a" - should conflict (ambiguous reference)
769+
let qualified_col =
770+
Expr::Column(datafusion::common::Column::new(Some("table"), "a"));
771+
let result2 = tracker.get_uniquely_named_expr(qualified_col)?;
772+
// Should be aliased to avoid ambiguous reference
773+
assert_eq!(
774+
result2,
775+
Expr::Column(datafusion::common::Column::new(Some("table"), "a"))
776+
.alias("table.a__temp__0")
777+
);
778+
779+
Ok(())
780+
}
781+
782+
#[test]
783+
fn name_tracker_different_qualifiers_no_conflict() -> Result<()> {
784+
let mut tracker = NameTracker::new();
785+
786+
// First: qualified column "table1.a"
787+
let col1 = Expr::Column(datafusion::common::Column::new(Some("table1"), "a"));
788+
let result1 = tracker.get_uniquely_named_expr(col1.clone())?;
789+
assert_eq!(result1, col1);
790+
791+
// Second: qualified column "table2.a" - different qualifier, different schema_name
792+
// so should NOT conflict
793+
let col2 = Expr::Column(datafusion::common::Column::new(Some("table2"), "a"));
794+
let result2 = tracker.get_uniquely_named_expr(col2.clone())?;
795+
assert_eq!(result2, col2);
796+
797+
Ok(())
798+
}
799+
800+
#[test]
801+
fn name_tracker_aliased_expressions() -> Result<()> {
802+
let mut tracker = NameTracker::new();
803+
804+
// First: col("x").alias("result")
805+
let expr1 = col("x").alias("result");
806+
let result1 = tracker.get_uniquely_named_expr(expr1.clone())?;
807+
assert_eq!(result1, col("x").alias("result"));
808+
809+
// Second: col("y").alias("result") - same alias name, should conflict
810+
let expr2 = col("y").alias("result");
811+
let result2 = tracker.get_uniquely_named_expr(expr2)?;
812+
assert_eq!(result2, col("y").alias("result").alias("result__temp__0"));
813+
814+
Ok(())
815+
}
644816
}

datafusion/substrait/tests/cases/consumer_integration.rs

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -651,31 +651,23 @@ mod tests {
651651
#[tokio::test]
652652
async fn test_multiple_unions() -> Result<()> {
653653
let plan_str = test_plan_to_string("multiple_unions.json").await?;
654-
655-
let mut settings = insta::Settings::clone_current();
656-
settings.add_filter(
657-
r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
658-
"[UUID]",
659-
);
660-
settings.bind(|| {
661-
assert_snapshot!(
662-
plan_str,
663-
@r#"
664-
Projection: [UUID] AS product_category, [UUID] AS product_type, product_key
665-
Union
666-
Projection: Utf8("people") AS [UUID], Utf8("people") AS [UUID], sales.product_key
667-
Left Join: sales.product_key = food.@food_id
668-
TableScan: sales
669-
TableScan: food
670-
Union
671-
Projection: people.$f3, people.$f5, people.product_key0
672-
Left Join: people.product_key0 = food.@food_id
673-
TableScan: people
674-
TableScan: food
675-
TableScan: more_products
676-
"#
654+
assert_snapshot!(
655+
plan_str,
656+
@r#"
657+
Projection: Utf8("people") AS product_category, Utf8("people")__temp__0 AS product_type, product_key
658+
Union
659+
Projection: Utf8("people"), Utf8("people") AS Utf8("people")__temp__0, sales.product_key
660+
Left Join: sales.product_key = food.@food_id
661+
TableScan: sales
662+
TableScan: food
663+
Union
664+
Projection: people.$f3, people.$f5, people.product_key0
665+
Left Join: people.product_key0 = food.@food_id
666+
TableScan: people
667+
TableScan: food
668+
TableScan: more_products
669+
"#
677670
);
678-
});
679671

680672
Ok(())
681673
}

datafusion/substrait/tests/cases/logical_plans.rs

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -160,28 +160,21 @@ mod tests {
160160
let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?;
161161
let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?;
162162

163-
let mut settings = insta::Settings::clone_current();
164-
settings.add_filter(
165-
r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
166-
"[UUID]",
163+
assert_snapshot!(
164+
plan,
165+
@r"
166+
Projection: left.A, left.Utf8(NULL) AS C, right.D, Utf8(NULL) AS Utf8(NULL)__temp__0 AS E
167+
Left Join: left.A = right.A
168+
SubqueryAlias: left
169+
Union
170+
Projection: A.A, Utf8(NULL)
171+
TableScan: A
172+
Projection: B.A, CAST(B.C AS Utf8)
173+
TableScan: B
174+
SubqueryAlias: right
175+
TableScan: C
176+
"
167177
);
168-
settings.bind(|| {
169-
assert_snapshot!(
170-
plan,
171-
@r"
172-
Projection: left.A, left.[UUID] AS C, right.D, Utf8(NULL) AS [UUID] AS E
173-
Left Join: left.A = right.A
174-
SubqueryAlias: left
175-
Union
176-
Projection: A.A, Utf8(NULL) AS [UUID]
177-
TableScan: A
178-
Projection: B.A, CAST(B.C AS Utf8)
179-
TableScan: B
180-
SubqueryAlias: right
181-
TableScan: C
182-
"
183-
);
184-
});
185178

186179
// Trigger execution to ensure plan validity
187180
DataFrame::new(ctx.state(), plan).show().await?;

0 commit comments

Comments
 (0)