From 76150b278442941cebf20f2ff3593baae5df2581 Mon Sep 17 00:00:00 2001 From: xiedeyantu Date: Wed, 8 Apr 2026 22:05:41 +0800 Subject: [PATCH 1/4] fix: raise AmbiguousReference error for duplicate column names in subquery --- datafusion/common/src/column.rs | 32 ++++- datafusion/common/src/dfschema.rs | 94 ++++++++++++- datafusion/core/src/physical_planner.rs | 2 +- datafusion/expr/src/logical_plan/builder.rs | 25 +++- datafusion/expr/src/logical_plan/plan.rs | 59 ++++++++- datafusion/sql/tests/sql_integration.rs | 13 ++ datafusion/sqllogictest/test_files/joins.slt | 132 +++++++++++++++++++ 7 files changed, 351 insertions(+), 6 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index c7f0b5a4f4881..9f1587bbf74b5 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -237,7 +237,37 @@ impl Column { .collect::>(); match qualified_fields.len() { 0 => continue, - 1 => return Ok(Column::from(qualified_fields[0])), + 1 => { + // Even a single structural match must be rejected when the + // schema itself has flagged the name as ambiguous (e.g. a + // derived-table subquery that contained two columns with + // the same unqualified name). + let is_ambiguous = schema_level + .iter() + .any(|s| s.ambiguous_names().contains(&self.name)); + if is_ambiguous { + return _schema_err!(SchemaError::AmbiguousReference { + field: Box::new(Column::new_unqualified(&self.name)), + }) + .map_err(|err| { + let mut diagnostic = Diagnostic::new_error( + format!("column '{}' is ambiguous", &self.name), + self.spans().first(), + ); + let columns = schema_level + .iter() + .flat_map(|s| s.columns_with_unqualified_name(&self.name)) + .collect::>(); + add_possible_columns_to_diag( + &mut diagnostic, + &Column::new_unqualified(&self.name), + &columns, + ); + err.with_diagnostic(diagnostic) + }); + } + return Ok(Column::from(qualified_fields[0])); + } _ => { // More than 1 fields in this schema have their names set to self.name. 
// diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index de0aacf9e8bcd..b44d4d70b0323 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -19,7 +19,7 @@ //! fields with optional relation names. use std::collections::{BTreeSet, HashMap, HashSet}; -use std::fmt::{Display, Formatter}; +use std::fmt::{self, Display, Formatter}; use std::hash::Hash; use std::sync::Arc; @@ -108,7 +108,7 @@ pub type DFSchemaRef = Arc; /// let schema: &Schema = df_schema.as_arrow(); /// assert_eq!(schema.fields().len(), 1); /// ``` -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Clone, PartialEq, Eq)] pub struct DFSchema { /// Inner Arrow schema reference. inner: SchemaRef, @@ -117,6 +117,26 @@ pub struct DFSchema { field_qualifiers: Vec>, /// Stores functional dependencies in the schema. functional_dependencies: FunctionalDependencies, + /// Field names that are ambiguous in this schema because the underlying + /// source (e.g. a derived-table subquery) contained multiple columns with + /// the same unqualified name. Any attempt to reference these names without + /// a qualifier should produce an [`SchemaError::AmbiguousReference`] error. + ambiguous_names: Option>>, +} + +impl fmt::Debug for DFSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Show the ambiguous-names set as `{}` when it is empty/absent so that + // existing Debug snapshots are not affected by the Option wrapper. 
+ let empty = HashSet::new(); + let ambiguous = self.ambiguous_names.as_deref().unwrap_or(&empty); + f.debug_struct("DFSchema") + .field("inner", &self.inner) + .field("field_qualifiers", &self.field_qualifiers) + .field("functional_dependencies", &self.functional_dependencies) + .field("ambiguous_names", ambiguous) + .finish() + } } impl DFSchema { @@ -126,6 +146,7 @@ impl DFSchema { inner: Arc::new(Schema::new([])), field_qualifiers: vec![], functional_dependencies: FunctionalDependencies::empty(), + ambiguous_names: None, } } @@ -157,6 +178,7 @@ impl DFSchema { inner: schema, field_qualifiers: qualifiers, functional_dependencies: FunctionalDependencies::empty(), + ambiguous_names: None, }; dfschema.check_names()?; Ok(dfschema) @@ -173,6 +195,7 @@ impl DFSchema { inner: schema, field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), + ambiguous_names: None, }; dfschema.check_names()?; Ok(dfschema) @@ -191,6 +214,7 @@ impl DFSchema { inner: schema.clone().into(), field_qualifiers: vec![Some(qualifier); schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), + ambiguous_names: None, }; schema.check_names()?; Ok(schema) @@ -205,6 +229,7 @@ impl DFSchema { inner: Arc::clone(schema), field_qualifiers: qualifiers, functional_dependencies: FunctionalDependencies::empty(), + ambiguous_names: None, }; dfschema.check_names()?; Ok(dfschema) @@ -226,6 +251,7 @@ impl DFSchema { inner: Arc::clone(&self.inner), field_qualifiers: qualifiers, functional_dependencies: self.functional_dependencies.clone(), + ambiguous_names: self.ambiguous_names.clone(), }) } @@ -275,6 +301,35 @@ impl DFSchema { } } + /// Marks the given field names as ambiguous. + /// + /// Ambiguous names correspond to fields that originated from multiple + /// source columns with the same unqualified name (e.g. both sides of a + /// JOIN having an `age` column). 
Any attempt to resolve such a name + /// without a table qualifier will produce an + /// [`SchemaError::AmbiguousReference`] error. + pub fn with_ambiguous_names(mut self, names: HashSet) -> Self { + self.ambiguous_names = if names.is_empty() { + None + } else { + Some(Arc::new(names)) + }; + self + } + + /// Returns the set of field names that are considered ambiguous in this + /// schema. See [`Self::with_ambiguous_names`]. + /// + /// Returns a reference to an empty set when no ambiguous names have been + /// recorded (the common case). + pub fn ambiguous_names(&self) -> &HashSet { + static EMPTY: std::sync::OnceLock> = + std::sync::OnceLock::new(); + self.ambiguous_names + .as_deref() + .unwrap_or_else(|| EMPTY.get_or_init(HashSet::new)) + } + /// Create a new schema that contains the fields from this schema followed by the fields /// from the supplied schema. An error will be returned if there are duplicate field names. pub fn join(&self, schema: &DFSchema) -> Result { @@ -294,6 +349,7 @@ impl DFSchema { inner: Arc::new(new_schema_with_metadata), field_qualifiers: new_qualifiers, functional_dependencies: FunctionalDependencies::empty(), + ambiguous_names: None, }; new_self.check_names()?; Ok(new_self) @@ -350,6 +406,22 @@ impl DFSchema { let finished_with_metadata = finished.with_metadata(metadata); self.inner = finished_with_metadata.into(); self.field_qualifiers.extend(qualifiers); + // Propagate ambiguous names from the other schema so that names marked + // as ambiguous (e.g. by a JOIN) are not silently dropped when schemas + // are merged for ORDER BY / HAVING resolution. + if let Some(other_names) = &other_schema.ambiguous_names { + match &mut self.ambiguous_names { + Some(self_names) => { + // Build a new combined set (Arc prevents in-place mutation). 
+ let mut combined = (**self_names).clone(); + combined.extend(other_names.iter().cloned()); + self.ambiguous_names = Some(Arc::new(combined)); + } + None => { + self.ambiguous_names = Some(Arc::clone(other_names)); + } + } + } } /// Get a list of fields for this schema @@ -506,6 +578,18 @@ impl DFSchema { &self, name: &str, ) -> Result<(Option<&TableReference>, &FieldRef)> { + // If this field name was marked as ambiguous at schema creation time + // (e.g. because a derived-table subquery produced duplicate column + // names), refuse to resolve it without an explicit qualifier. + if self + .ambiguous_names + .as_ref() + .is_some_and(|s| s.contains(name)) + { + return _schema_err!(SchemaError::AmbiguousReference { + field: Box::new(Column::new_unqualified(name.to_string())) + }); + } let matches = self.qualified_fields_with_unqualified_name(name); match matches.len() { 0 => Err(unqualified_field_not_found(name, self)), @@ -845,6 +929,7 @@ impl DFSchema { field_qualifiers: vec![None; self.inner.fields.len()], inner: self.inner, functional_dependencies: self.functional_dependencies, + ambiguous_names: self.ambiguous_names, } } @@ -855,6 +940,7 @@ impl DFSchema { field_qualifiers: vec![Some(qualifier); self.inner.fields.len()], inner: self.inner, functional_dependencies: self.functional_dependencies, + ambiguous_names: self.ambiguous_names, } } @@ -1126,6 +1212,7 @@ impl TryFrom for DFSchema { inner: schema, field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), + ambiguous_names: None, }; // Without checking names, because schema here may have duplicate field names. 
// For example, Partial AggregateMode will generate duplicate field names from @@ -1187,6 +1274,7 @@ impl ToDFSchema for Vec { inner: schema.into(), field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), + ambiguous_names: None, }; Ok(dfschema) } @@ -1578,6 +1666,7 @@ mod tests { inner: Arc::clone(&arrow_schema_ref), field_qualifiers: vec![None; arrow_schema_ref.fields.len()], functional_dependencies: FunctionalDependencies::empty(), + ambiguous_names: None, }; let df_schema_ref = Arc::new(df_schema.clone()); @@ -1624,6 +1713,7 @@ mod tests { inner: Arc::clone(&schema), field_qualifiers: vec![None; schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), + ambiguous_names: None, }; assert_eq!(df_schema.inner.metadata(), schema.metadata()) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index bf84fcc53e957..540c8e941015a 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -3570,7 +3570,7 @@ mod tests { .expect_err("planning error") .strip_backtrace(); - insta::assert_snapshot!(e, @r#"Error during planning: Extension planner for NoOp created an ExecutionPlan with mismatched schema. LogicalPlan schema: DFSchema { inner: Schema { fields: [Field { name: "a", data_type: Int32 }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }, ExecutionPlan schema: Schema { fields: [Field { name: "b", data_type: Int32 }], metadata: {} }"#); + insta::assert_snapshot!(e, @r#"Error during planning: Extension planner for NoOp created an ExecutionPlan with mismatched schema. 
LogicalPlan schema: DFSchema { inner: Schema { fields: [Field { name: "a", data_type: Int32 }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] }, ambiguous_names: {} }, ExecutionPlan schema: Schema { fields: [Field { name: "b", data_type: Int32 }], metadata: {} }"#); } #[tokio::test] diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 5381313e2ee9b..39f84ccb38766 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1745,7 +1745,30 @@ pub fn build_join_schema( .collect(); let dfschema = DFSchema::new_with_metadata(qualified_fields, metadata)?; - dfschema.with_functional_dependencies(func_dependencies) + let dfschema = dfschema.with_functional_dependencies(func_dependencies)?; + + // Propagate ambiguous names from both input schemas. A name that was + // already ambiguous on either side of the join (e.g. because the left + // input is itself a subquery that wrapped a JOIN) remains ambiguous in + // the output. We only propagate names that actually appear as field + // names in the output schema so we don't accumulate stale entries. + let output_field_names: HashSet<&str> = dfschema + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect(); + let inherited_ambiguous: HashSet = left + .ambiguous_names() + .iter() + .chain(right.ambiguous_names()) + .filter(|n| output_field_names.contains(n.as_str())) + .cloned() + .collect(); + if inherited_ambiguous.is_empty() { + Ok(dfschema) + } else { + Ok(dfschema.with_ambiguous_names(inherited_ambiguous)) + } } /// (Re)qualify the sides of a join if needed, i.e. 
if the columns from one side would otherwise diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 07e0eb1a77aa9..bfb513e661cef 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2375,6 +2375,29 @@ pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result = exprs + .iter() + .filter_map(|e| { + if let Expr::Column(col) = e + && input_ambiguous.contains(&col.name) + { + return Some(col.name.clone()); + } + None + }) + .collect(); + if !inherited.is_empty() { + return Ok(Arc::new(schema.with_ambiguous_names(inherited))); + } + } + Ok(Arc::new(schema)) } @@ -2406,6 +2429,39 @@ impl SubqueryAlias { let aliases = unique_field_aliases(plan.schema().fields()); let is_projection_needed = aliases.iter().any(Option::is_some); + // Collect unqualified field names that are ambiguous in this alias's + // output schema. `unique_field_aliases` renames duplicates (e.g. to + // "id:1") to keep Arrow happy, but outer queries must still be + // prevented from referencing those names without qualification. + // We also inherit names already marked ambiguous by the input schema + // so nested `SELECT * FROM (...) AS sN` wrappers don't lose the marker. + let ambiguous_names: HashSet = { + let mut name_counts: HashMap<&str, usize> = HashMap::new(); + for field in plan.schema().fields() { + *name_counts.entry(field.name().as_str()).or_insert(0) += 1; + } + let mut names: HashSet = name_counts + .into_iter() + .filter(|&(_, count)| count >= 2) + .map(|(name, _)| name.to_string()) + .collect(); + + // Inherit names still visible in the output (the first occurrence + // of a renamed duplicate like "id:1" still keeps the name "id"). 
+ let output_field_names: HashSet<&str> = plan + .schema() + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect(); + for inherited in plan.schema().ambiguous_names() { + if output_field_names.contains(inherited.as_str()) { + names.insert(inherited.clone()); + } + } + names + }; + // Insert a projection node, if needed, to make sure aliases are applied. let plan = if is_projection_needed { let projection_expressions = aliases @@ -2438,7 +2494,8 @@ impl SubqueryAlias { let schema = DFSchemaRef::new( DFSchema::try_from_qualified_schema(alias.clone(), schema)? - .with_functional_dependencies(func_dependencies)?, + .with_functional_dependencies(func_dependencies)? + .with_ambiguous_names(ambiguous_names), ); Ok(SubqueryAlias { input: plan, diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index fd606af3a6af0..287e1c472322f 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -3938,6 +3938,19 @@ fn order_by_ambiguous_name() { ); } +#[test] +fn order_by_ambiguous_name_via_subquery() { + // `age` is not in the SELECT list; ORDER BY falls back to the FROM schema, + // which is a subquery over a JOIN — `age` must still be flagged ambiguous. 
+ let sql = "SELECT id FROM (SELECT * FROM person a JOIN person b USING (id)) sub ORDER BY age"; + let err = logical_plan(sql).unwrap_err().strip_backtrace(); + + assert_snapshot!( + err, + @"Schema error: Ambiguous reference to unqualified field age" + ); +} + #[test] fn group_by_ambiguous_name() { let sql = "select max(id) from person a join person b using (id) group by age"; diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 1ed3fc89b2642..42bd77eda0153 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -5508,3 +5508,135 @@ DROP TABLE t1; statement ok DROP TABLE t2; + +#### +# Ambiguous unqualified column references through a subquery alias wrapping JOINs. +# When two or more JOIN inputs share a column name the outer query must use +# the qualified form (alias.column); bare unqualified references must be rejected. +#### + +statement ok +CREATE TABLE t_left(id INT, age INT, name VARCHAR) AS VALUES + (1, 10, 'alice'), + (2, 20, 'bob'), + (3, 30, 'carol'); + +statement ok +CREATE TABLE t_right(id INT, age INT, score INT) AS VALUES + (1, 10, 100), + (2, 20, 200), + (4, 40, 400); + +statement ok +CREATE TABLE t_extra(id INT, dept VARCHAR) AS VALUES + (1, 'eng'), + (2, 'sales'), + (5, 'hr'); + +# 2-way join: qualified references to columns shared by both sides work fine +query III rowsort +SELECT sub.id, sub.age, sub.score +FROM (SELECT t_left.id, t_left.age, t_right.score + FROM t_left JOIN t_right ON t_left.id = t_right.id) AS sub; +---- +1 10 100 +2 20 200 + +# 2-way join: unqualified "id" is ambiguous (both sides expose it) +query error DataFusion error: Schema error: Ambiguous reference to unqualified field id +SELECT sub.id FROM (SELECT * FROM t_left JOIN t_right ON t_left.id = t_right.id) AS sub WHERE id = 1; + +# 2-way join: unqualified "age" is ambiguous (both sides expose it) +query error DataFusion error: Schema error: Ambiguous reference 
to unqualified field age +SELECT sub.age FROM (SELECT * FROM t_left JOIN t_right ON t_left.id = t_right.id) AS sub WHERE age > 5; + +# 3-way join: qualified references still work when all three tables share "id" +query IIIT rowsort +SELECT sub.id, sub.age, sub.score, sub.dept +FROM (SELECT t_left.id, t_left.age, t_right.score, t_extra.dept + FROM t_left + JOIN t_right ON t_left.id = t_right.id + JOIN t_extra ON t_left.id = t_extra.id) AS sub; +---- +1 10 100 eng +2 20 200 sales + +# 3-way join: unqualified "id" is ambiguous (present in all three tables) +query error DataFusion error: Schema error: Ambiguous reference to unqualified field id +SELECT sub.id FROM (SELECT * FROM t_left + JOIN t_right ON t_left.id = t_right.id + JOIN t_extra ON t_left.id = t_extra.id) AS sub +WHERE id = 1; + +# 3-way join: unqualified "age" is ambiguous (shared by t_left and t_right) +query error DataFusion error: Schema error: Ambiguous reference to unqualified field age +SELECT sub.age FROM (SELECT * FROM t_left + JOIN t_right ON t_left.id = t_right.id + JOIN t_extra ON t_left.id = t_extra.id) AS sub +WHERE age > 5; + +# 3-way join: columns unique to one table stay accessible through the subquery alias +query IT rowsort +SELECT sub.score, sub.dept +FROM (SELECT t_left.id, t_left.age, t_right.score, t_extra.dept + FROM t_left + JOIN t_right ON t_left.id = t_right.id + JOIN t_extra ON t_left.id = t_extra.id) AS sub; +---- +100 eng +200 sales + +# Nested derived table: double SELECT * wrapper must preserve ambiguity. +# SELECT age FROM ( SELECT * FROM ( SELECT * FROM t_left JOIN t_right ON t_left.id = t_right.id ) AS s1 ) AS s2 +# "age" is ambiguous in s1 (from both t_left and t_right) and must stay +# ambiguous when s1 is wrapped in another SELECT * … AS s2. 
+query error DataFusion error: Schema error: Ambiguous reference to unqualified field age +SELECT age FROM ( + SELECT * FROM ( + SELECT * FROM t_left JOIN t_right ON t_left.id = t_right.id + ) AS s1 +) AS s2; + +# Nested derived table: same for "id" +query error DataFusion error: Schema error: Ambiguous reference to unqualified field id +SELECT id FROM ( + SELECT * FROM ( + SELECT * FROM t_left JOIN t_right ON t_left.id = t_right.id + ) AS s1 +) AS s2; + +# Join over subquery + table: ambiguous names from the subquery side must +# propagate into the outer join schema so that bare "age" is still rejected. +# Set up a seed table with a single column so only the subquery side has "age". +statement ok +CREATE TABLE seed(val INT) AS VALUES (1), (2); + +query error DataFusion error: Schema error: Ambiguous reference to unqualified field age +SELECT age FROM (SELECT * FROM t_left JOIN t_right ON t_left.id = t_right.id) sub +JOIN seed ON true; + +# Qualified access through the subquery alias is still fine even after joining +# with another table. 
+query II rowsort +SELECT sub.id, sub.score FROM ( + SELECT t_left.id, t_right.score + FROM t_left JOIN t_right ON t_left.id = t_right.id +) sub +JOIN seed ON true; +---- +1 100 +1 100 +2 200 +2 200 + +statement ok +DROP TABLE seed; + +statement ok +DROP TABLE t_left; + +statement ok +DROP TABLE t_right; + +statement ok +DROP TABLE t_extra; From e3e3c55cb03ce205bedd9438e0ef982eb6876cff Mon Sep 17 00:00:00 2001 From: xiedeyantu Date: Wed, 8 Apr 2026 22:10:26 +0800 Subject: [PATCH 2/4] fix test --- datafusion/common/src/dfschema.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index b44d4d70b0323..13e0be8c24dff 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -125,7 +125,7 @@ pub struct DFSchema { } impl fmt::Debug for DFSchema { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { // Show the ambiguous-names set as `{}` when it is empty/absent so that // existing Debug snapshots are not affected by the Option wrapper. let empty = HashSet::new(); @@ -1281,7 +1281,7 @@ impl ToDFSchema for Vec { } impl Display for DFSchema { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!( f, "fields:[{}], metadata:{:?}", @@ -1299,7 +1299,7 @@ impl Display for DFSchema { /// /// Note that this trait is implemented for &[DFSchema] which is /// widely used in the DataFusion codebase. -pub trait ExprSchema: std::fmt::Debug { +pub trait ExprSchema: fmt::Debug { /// Is this column reference nullable? 
fn nullable(&self, col: &Column) -> Result { Ok(self.field_from_column(col)?.is_nullable()) @@ -1326,7 +1326,7 @@ pub trait ExprSchema: std::fmt::Debug { } // Implement `ExprSchema` for `Arc` -impl + std::fmt::Debug> ExprSchema for P { +impl + fmt::Debug> ExprSchema for P { fn nullable(&self, col: &Column) -> Result { self.as_ref().nullable(col) } From 4e8a1a5193dfd884bcbc8016546310a45f1747e9 Mon Sep 17 00:00:00 2001 From: xiedeyantu Date: Wed, 8 Apr 2026 22:28:50 +0800 Subject: [PATCH 3/4] fix fmt --- datafusion/common/src/dfschema.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 13e0be8c24dff..ca43a9e49f53c 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -323,8 +323,7 @@ impl DFSchema { /// Returns a reference to an empty set when no ambiguous names have been /// recorded (the common case). pub fn ambiguous_names(&self) -> &HashSet { - static EMPTY: std::sync::OnceLock> = - std::sync::OnceLock::new(); + static EMPTY: std::sync::OnceLock> = std::sync::OnceLock::new(); self.ambiguous_names .as_deref() .unwrap_or_else(|| EMPTY.get_or_init(HashSet::new)) From 0c8b96d7249c26119873f704a78fbb7043bbee02 Mon Sep 17 00:00:00 2001 From: xiedeyantu Date: Thu, 9 Apr 2026 07:09:22 +0800 Subject: [PATCH 4/4] fix panic --- datafusion/core/benches/sql_planner.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs index 59502da987904..fcc8da30fedd9 100644 --- a/datafusion/core/benches/sql_planner.rs +++ b/datafusion/core/benches/sql_planner.rs @@ -130,7 +130,8 @@ fn register_clickbench_hits_table(rt: &Runtime) -> SessionContext { format!("{BENCHMARKS_PATH_2}{CLICKBENCH_DATA_PATH}") }; - let sql = format!("CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '{path}'"); + let sql = + format!("CREATE EXTERNAL TABLE hits_raw STORED AS 
PARQUET LOCATION '{path}'"); // ClickBench partitioned dataset was written by an ancient version of pyarrow that // that wrote strings with the wrong logical type. To read it correctly, we must @@ -139,6 +140,17 @@ fn register_clickbench_hits_table(rt: &Runtime) -> SessionContext { .unwrap(); rt.block_on(ctx.sql(&sql)).unwrap(); + // ClickBench stores EventDate as UInt16 (days since 1970-01-01). Create a view + // that exposes it as SQL DATE so that queries comparing it with date literals + // (e.g. "EventDate >= '2013-07-01'") work correctly during planning. + rt.block_on(ctx.sql( + "CREATE VIEW hits AS \ + SELECT * EXCEPT (\"EventDate\"), \ + CAST(CAST(\"EventDate\" AS INTEGER) AS DATE) AS \"EventDate\" \ + FROM hits_raw", + )) + .unwrap(); + let count = rt.block_on(async { ctx.table("hits").await.unwrap().count().await.unwrap() }); assert!(count > 0);