diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index c7f0b5a4f4881..0332fa3f59f34 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -439,7 +439,7 @@ mod tests { &[], ) .expect_err("should've failed to find field"); - let expected = "Schema error: No field named z. \ + let expected = "Schema error: No field named z.\n\ Valid fields are t1.a, t1.b, t2.c, t2.d, t3.a, t3.b, t3.c, t3.d, t3.e."; assert_eq!(err.strip_backtrace(), expected); diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index e3da99163ed69..a9013337fbe5b 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -1427,11 +1427,8 @@ mod tests { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; // lookup with unqualified name "t1.c0" let err = schema.index_of_column(&col).unwrap_err(); - let expected = "Schema error: No field named \"t1.c0\". \ - Column names are case sensitive. \ - You can use double quotes to refer to the \"\"t1.c0\"\" column \ - or set the datafusion.sql_parser.enable_ident_normalization configuration. \ - Did you mean 't1.c0'?."; + let expected = "Schema error: No field named \"t1.c0\". Did you mean 't1.c0'?\n\ + Valid fields are t1.c0, t1.c1."; assert_eq!(err.strip_backtrace(), expected); Ok(()) } @@ -1449,12 +1446,47 @@ mod tests { // lookup with unqualified name "t1.c0" let err = schema.index_of_column(&col).unwrap_err(); - let expected = "Schema error: No field named \"t1.c0\". \ + let expected = "Schema error: No field named \"t1.c0\".\n\ Valid fields are t1.\"CapitalColumn\", t1.\"field.with.period\"."; assert_eq!(err.strip_backtrace(), expected); Ok(()) } + #[test] + fn field_not_found_suggests_closest_field_name() -> Result<()> { + let schema = DFSchema::try_from(Schema::new(vec![ + Field::new("abzz", DataType::Boolean, true), + Field::new("abcd", DataType::Boolean, true), + ]))?; + + let err = schema.field_with_unqualified_name("abc").unwrap_err(); + let expected = "Schema error: No field named abc. Did you mean 'abcd'?\n\ + Valid fields are abzz, abcd."; + assert_eq!(err.strip_backtrace(), expected); + Ok(()) + } + + #[test] + fn field_not_found_suggests_case_sensitive_qualified_field() -> Result<()> { + let schema = DFSchema::try_from_qualified_schema( + "hits", + &Schema::new(vec![ + Field::new("WatchID", DataType::Boolean, true), + Field::new("URL", DataType::Boolean, true), + Field::new("URLHash", DataType::Boolean, true), + ]), + )?; + + let err = schema.field_with_unqualified_name("url").unwrap_err(); + let expected = "Schema error: No field named url. Did you mean 'hits.\"URL\"'?\n\ + Column names are case sensitive. \ + You can use double quotes to refer to the hits.\"URL\" column \ + or set the datafusion.sql_parser.enable_ident_normalization configuration.\n\ + Valid fields are hits.\"WatchID\", hits.\"URL\", hits.\"URLHash\"."; + assert_eq!(err.strip_backtrace(), expected); + Ok(()) + } + #[test] fn from_unqualified_schema() -> Result<()> { let schema = DFSchema::try_from(test_schema_1())?; diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index c6c50371c26c1..7003632f24d13 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -45,7 +45,7 @@ use std::io; use std::result; use std::sync::Arc; -use crate::utils::datafusion_strsim::normalized_levenshtein; +use crate::utils::datafusion_strsim::{levenshtein, normalized_levenshtein}; use crate::utils::quote_identifier; use crate::{Column, DFSchema, Diagnostic, TableReference}; use arrow::error::ArrowError; @@ -198,6 +198,71 @@ pub enum SchemaError { }, } +fn case_insensitive_field_match<'a>( + field: &Column, + valid_fields: &'a [Column], +) -> Option<&'a Column> { + let field_flat_name = field.flat_name(); + let field_name_lower = field.name().to_lowercase(); + let field_flat_name_lower = field_flat_name.to_lowercase(); + + valid_fields.iter().find(|valid_field| { + let valid_field_flat_name = valid_field.flat_name(); + let valid_field_name_lower = valid_field.name().to_lowercase(); + let valid_field_flat_name_lower = valid_field_flat_name.to_lowercase(); + + let name_differs_only_by_case = field_name_lower == valid_field_name_lower + && field.name() != valid_field.name(); + let flat_name_differs_only_by_case = field_flat_name_lower + == valid_field_flat_name_lower + && field_flat_name != valid_field_flat_name; + + name_differs_only_by_case || flat_name_differs_only_by_case + }) +} + +fn closest_valid_field<'a>( + field: &Column, + valid_fields: &'a [Column], +) -> Option<&'a Column> { + // Find the most similar valid field name. + let target_names = [ + field.name().to_lowercase(), + field.flat_name().to_lowercase(), + ]; + + let mut best_match: Option<(usize, usize, usize, &Column)> = None; + for (index, valid_field) in valid_fields.iter().enumerate() { + let valid_names = [ + valid_field.name().to_lowercase(), + valid_field.flat_name().to_lowercase(), + ]; + for target in &target_names { + for valid_name in &valid_names { + let distance = levenshtein(target, valid_name); + let max_len = target.chars().count().max(valid_name.chars().count()); + if max_len == 0 || distance * 2 > max_len { + continue; + } + + let should_replace = best_match.is_none_or( + |(best_distance, best_max_len, best_index, _)| { + distance < best_distance + || distance == best_distance + && (max_len > best_max_len + || max_len == best_max_len && index < best_index) + }, + ); + if should_replace { + best_match = Some((distance, max_len, index, valid_field)); + } + } + } + } + + best_match.map(|(_, _, _, valid_field)| valid_field) +} + impl Display for SchemaError { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { @@ -205,44 +270,39 @@ impl Display for SchemaError { field, valid_fields, } => { + let closest_field = closest_valid_field(field, valid_fields); + let case_sensitive_match = + case_insensitive_field_match(field, valid_fields); + write!(f, "No field named {}", field.quoted_flat_name())?; - let lower_valid_fields = valid_fields - .iter() - .map(|column| column.flat_name().to_lowercase()) - .collect::>(); - - let valid_fields_names = valid_fields - .iter() - .map(|column| column.flat_name()) - .collect::>(); - if lower_valid_fields.contains(&field.flat_name().to_lowercase()) { + if let Some(matched) = closest_field { + write!(f, ". Did you mean '{}'?", matched.quoted_flat_name())?; + } else { + write!(f, ".")?; + } + + if let Some(case_sensitive_match) = case_sensitive_match { write!( f, - ". Column names are case sensitive. You can use double quotes to refer to the \"{}\" column \ - or set the datafusion.sql_parser.enable_ident_normalization configuration", - field.quoted_flat_name() + "\nColumn names are case sensitive. You can use double quotes to refer to the {} column \ + or set the datafusion.sql_parser.enable_ident_normalization configuration.", + case_sensitive_match.quoted_flat_name() )?; } - let field_name = field.name(); - if let Some(matched) = valid_fields_names - .iter() - .filter(|str| normalized_levenshtein(str, field_name) >= 0.5) - .collect::>() - .first() - { - write!(f, ". Did you mean '{matched}'?")?; - } else if !valid_fields.is_empty() { + + if !valid_fields.is_empty() { write!( f, - ". Valid fields are {}", + "\nValid fields are {}.", valid_fields .iter() .map(|field| field.quoted_flat_name()) .collect::>() .join(", ") - )?; + ) + } else { + Ok(()) } - write!(f, ".") } Self::DuplicateQualifiedField { qualifier, name } => { write!( diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index e55a373adab9f..d20d9472cf90a 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -1141,7 +1141,13 @@ async fn test_aggregate_name_collision() -> Result<()> { // The select expr has the same display_name as the group_expr, // but since they are different expressions, it should fail. .expect_err("Expected error"); - assert_snapshot!(df.strip_backtrace(), @r#"Schema error: No field named aggregate_test_100.c2. Valid fields are "aggregate_test_100.c2 + aggregate_test_100.c3"."#); + assert_snapshot!( + df.strip_backtrace(), + @r#" +Schema error: No field named aggregate_test_100.c2. +Valid fields are "aggregate_test_100.c2 + aggregate_test_100.c3". +"# + ); Ok(()) } @@ -6305,7 +6311,10 @@ async fn test_alias_nested() -> Result<()> { let select2 = df.select(vec![col("alias1.a")]); assert_snapshot!( select2.unwrap_err().strip_backtrace(), - @"Schema error: No field named alias1.a. Valid fields are alias2.a, alias2.b, alias2.one." + @r#" +Schema error: No field named alias1.a. Did you mean 'alias2.a'? +Valid fields are alias2.a, alias2.b, alias2.one. +"# ); Ok(()) } diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index eab8114d6910b..a9a0c156538f9 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -483,7 +483,7 @@ mod test { normalize_col_with_schemas_and_ambiguity_check(expr, &[&schemas], &[]) .unwrap_err() .strip_backtrace(); - let expected = "Schema error: No field named b. \ + let expected = "Schema error: No field named b.\n\ Valid fields are \"tableA\".a."; assert_eq!(error, expected); } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 64763e33d93f7..8c5ea71fd2146 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -725,7 +725,7 @@ fn plan_insert_no_target_columns() { )] #[case::non_existing_column( "INSERT INTO test_decimal (nonexistent, price) VALUES (1, 2), (4, 5)", - "Schema error: No field named nonexistent. \ + "Schema error: No field named nonexistent.\n\ Valid fields are id, price." )] #[case::target_column_count_mismatch( @@ -1681,7 +1681,10 @@ fn select_simple_aggregate_with_groupby_and_column_in_group_by_does_not_exist() assert_snapshot!( err.strip_backtrace(), - @r#"Schema error: No field named doesnotexist. Valid fields are "sum(person.age)", person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person."😀"."# + @r#" +Schema error: No field named doesnotexist. +Valid fields are "sum(person.age)", person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person."😀". +"# ); } diff --git a/datafusion/sqllogictest/test_files/delete.slt b/datafusion/sqllogictest/test_files/delete.slt index 6131d6db3d5f7..1f33360824393 100644 --- a/datafusion/sqllogictest/test_files/delete.slt +++ b/datafusion/sqllogictest/test_files/delete.slt @@ -79,7 +79,7 @@ physical_plan # Deleting by columns that do not exist returns an error -query error DataFusion error: Schema error: No field named e. Valid fields are t1.a, t1.b, t1.c, t1.d. +query error DataFusion error: Schema error: No field named e\.\nValid fields are t1.a, t1.b, t1.c, t1.d. explain delete from t1 where e = 1; diff --git a/datafusion/sqllogictest/test_files/errors.slt b/datafusion/sqllogictest/test_files/errors.slt index 20c1db5cb1511..ab934279c32ec 100644 --- a/datafusion/sqllogictest/test_files/errors.slt +++ b/datafusion/sqllogictest/test_files/errors.slt @@ -180,13 +180,13 @@ SELECT DISTINCT - 84 FROM tab0 AS cor0 WHERE NOT + 96 / + col1 <= NULL GROUP BY statement ok create table a(timestamp int, birthday int, ts int, tokens int, amp int, staamp int); -query error DataFusion error: Schema error: No field named timetamp\. Did you mean 'a\.timestamp'\?\. +query error DataFusion error: Schema error: No field named timetamp\. Did you mean 'a\.timestamp'\?\nValid fields are a\.timestamp, a\.birthday, a\.ts, a\.tokens, a\.amp, a\.staamp\. select timetamp from a; -query error DataFusion error: Schema error: No field named dadsada\. Valid fields are a\.timestamp, a\.birthday, a\.ts, a\.tokens, a\.amp, a\.staamp\. +query error DataFusion error: Schema error: No field named dadsada\.\nValid fields are a\.timestamp, a\.birthday, a\.ts, a\.tokens, a\.amp, a\.staamp\. select dadsada from a; -query error DataFusion error: Schema error: No field named ammp\. Did you mean 'a\.amp'\?\. +query error DataFusion error: Schema error: No field named ammp\. Did you mean 'a\.amp'\?\nValid fields are a\.timestamp, a\.birthday, a\.ts, a\.tokens, a\.amp, a\.staamp\. select ammp from a; statement ok diff --git a/datafusion/sqllogictest/test_files/ident_normalization.slt b/datafusion/sqllogictest/test_files/ident_normalization.slt index b1bdb1d882274..e50a21984f24e 100644 --- a/datafusion/sqllogictest/test_files/ident_normalization.slt +++ b/datafusion/sqllogictest/test_files/ident_normalization.slt @@ -75,7 +75,7 @@ A Int64 NO # Expect error as 'a' is not a column -- "A" is and the identifiers # are not normalized -query error DataFusion error: Schema error: No field named a\. Valid fields are x\."A"\. +query error DataFusion error: Schema error: No field named a\. Did you mean 'x\."A"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the x\."A" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are x\."A"\. select a from x; # should work (note the uppercase 'A') diff --git a/datafusion/sqllogictest/test_files/identifiers.slt b/datafusion/sqllogictest/test_files/identifiers.slt index e5eec3bf7f2c0..740a2642f9398 100644 --- a/datafusion/sqllogictest/test_files/identifiers.slt +++ b/datafusion/sqllogictest/test_files/identifiers.slt @@ -90,16 +90,16 @@ drop table case_insensitive_test statement ok CREATE TABLE test("Column1" string) AS VALUES ('content1'); -statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\. +statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\."Column1"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the test\."Column1" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are test\."Column1"\. SELECT COLumn1 from test -statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\. +statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\."Column1"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the test\."Column1" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are test\."Column1"\. SELECT Column1 from test -statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\. +statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\."Column1"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the test\."Column1" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are test\."Column1"\. SELECT column1 from test -statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\. +statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\."Column1"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the test\."Column1" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are test\."Column1"\. SELECT "column1" from test statement ok diff --git a/datafusion/sqllogictest/test_files/join.slt.part b/datafusion/sqllogictest/test_files/join.slt.part index b9d163d877596..00bea008fc2fc 100644 --- a/datafusion/sqllogictest/test_files/join.slt.part +++ b/datafusion/sqllogictest/test_files/join.slt.part @@ -94,7 +94,7 @@ statement ok set datafusion.execution.batch_size = 4096; # left semi with wrong where clause -query error DataFusion error: Schema error: No field named t2\.t2_id\. Did you mean 't1\.t1_id'\?\. +query error DataFusion error: Schema error: No field named t2\.t2_id\. Did you mean 't1\.t1_id'\?\nValid fields are t1\.t1_id, t1\.t1_name, t1\.t1_int\. SELECT t1.t1_id, t1.t1_name, t1.t1_int FROM t1 LEFT SEMI JOIN t2 ON t1.t1_id = t2.t2_id diff --git a/datafusion/sqllogictest/test_files/references.slt b/datafusion/sqllogictest/test_files/references.slt index 0e72c5e5a29e9..146046cffab72 100644 --- a/datafusion/sqllogictest/test_files/references.slt +++ b/datafusion/sqllogictest/test_files/references.slt @@ -66,7 +66,7 @@ CREATE TABLE test("f.c1" TEXT, "test.c2" INT, "...." INT) AS VALUES ('foobar', 2, 20), ('foobaz', 3, 30); -query error DataFusion error: Schema error: No field named f1\.c1\. Valid fields are test\."f\.c1", test\."test\.c2", test\."\.\.\.\."\. +query error DataFusion error: Schema error: No field named f1\.c1\. Did you mean 'test\."f\.c1"'\?\nValid fields are test\."f\.c1", test\."test\.c2", test\."\.\.\.\."\. SELECT f1.c1 FROM test; query T diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 3e97dc4588655..5ac4681f25171 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1181,7 +1181,7 @@ SELECT * FROM empty_table statement ok CREATE TABLE case_sensitive_table("INT32" int) AS VALUES (1), (2), (3), (4), (5); -statement error DataFusion error: Schema error: No field named int32\. Valid fields are case_sensitive_table\."INT32"\. +statement error DataFusion error: Schema error: No field named int32\. Did you mean 'case_sensitive_table\."INT32"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the case_sensitive_table\."INT32" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are case_sensitive_table\."INT32"\. select "int32" from case_sensitive_table query I @@ -1823,7 +1823,7 @@ select a + b from (select 1 as a, 2 as b, 1 as "a + b"); 3 # Can't reference an output column by expression over projection. -query error DataFusion error: Schema error: No field named a\. Valid fields are "a \+ Int64\(1\)"\. +query error DataFusion error: Schema error: No field named a\.\nValid fields are "a \+ Int64\(1\)"\. select a + 1 from (select a+1 from (select 1 as a)); query I @@ -1861,7 +1861,7 @@ statement ok DROP TABLE test; # Can't reference an unqualified column by a qualified name -query error DataFusion error: Schema error: No field named t1\.v1\. Column names are case sensitive\. You can use double quotes to refer to the "t1\.v1" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\. Valid fields are "t1\.v1"\. +query error DataFusion error: Schema error: No field named t1\.v1\. Did you mean '"t1\.v1"'\?\nValid fields are "t1\.v1"\. SELECT t1.v1 FROM (SELECT 1 AS "t1.v1"); # Test issue: https://github.com/apache/datafusion/issues/14124 diff --git a/datafusion/sqllogictest/test_files/union_by_name.slt b/datafusion/sqllogictest/test_files/union_by_name.slt index 6a1608d5d1348..dbcaea778c0d9 100644 --- a/datafusion/sqllogictest/test_files/union_by_name.slt +++ b/datafusion/sqllogictest/test_files/union_by_name.slt @@ -124,7 +124,7 @@ NULL 5 # Ambiguous name -statement error DataFusion error: Schema error: No field named x. Valid fields are a, b. +statement error DataFusion error: Schema error: No field named x\.\nValid fields are a, b. SELECT x AS a FROM t1 UNION BY NAME SELECT x AS b FROM t1 ORDER BY x; query II