Skip to content

Commit 18b2aaa

Browse files
jayzhan211alamb
andauthored
Infer data type from schema for Values and add struct coercion to coalesce (#12864)
* first draft Signed-off-by: jayzhan211 <jayzhan211@gmail.com> * cleanup Signed-off-by: jayzhan211 <jayzhan211@gmail.com> * add values table without schema Signed-off-by: jayzhan211 <jayzhan211@gmail.com> * cleanup Signed-off-by: jayzhan211 <jayzhan211@gmail.com> * fmt Signed-off-by: jayzhan211 <jayzhan211@gmail.com> * rm unused import Signed-off-by: jayzhan211 <jayzhan211@gmail.com> * fmt Signed-off-by: jayzhan211 <jayzhan211@gmail.com> * use option instead of vec<err> Signed-off-by: jayzhan211 <jayzhan211@gmail.com> * Fix clippy * add values back and rename Signed-off-by: jayzhan211 <jayzhan211@gmail.com> * invalid query Signed-off-by: jayzhan211 <jayzhan211@gmail.com> * use values if no schema Signed-off-by: jayzhan211 <jayzhan211@gmail.com> * add doc Signed-off-by: jayzhan211 <jayzhan211@gmail.com> --------- Signed-off-by: jayzhan211 <jayzhan211@gmail.com> Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent de526a9 commit 18b2aaa

20 files changed

Lines changed: 368 additions & 126 deletions

File tree

datafusion-cli/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/common/src/dfschema.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,6 @@ impl DFSchema {
315315
None => self_unqualified_names.contains(field.name().as_str()),
316316
};
317317
if !duplicated_field {
318-
// self.inner.fields.push(field.clone());
319318
schema_builder.push(Arc::clone(field));
320319
qualifiers.push(qualifier.cloned());
321320
}

datafusion/expr-common/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,5 @@ path = "src/lib.rs"
4040
[dependencies]
4141
arrow = { workspace = true }
4242
datafusion-common = { workspace = true }
43+
itertools = { workspace = true }
4344
paste = "^1.0"

datafusion/expr-common/src/type_coercion/binary.rs

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@ use arrow::datatypes::{
2828
DataType, Field, FieldRef, Fields, TimeUnit, DECIMAL128_MAX_PRECISION,
2929
DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
3030
};
31-
use datafusion_common::{exec_datafusion_err, plan_datafusion_err, plan_err, Result};
31+
use datafusion_common::{
32+
exec_datafusion_err, exec_err, internal_err, plan_datafusion_err, plan_err, Result,
33+
};
34+
use itertools::Itertools;
3235

3336
/// The type signature of an instantiation of binary operator expression such as
3437
/// `lhs + rhs`
@@ -372,6 +375,8 @@ impl From<&DataType> for TypeCategory {
372375
/// decimal precision and scale when coercing decimal types.
373376
///
374377
/// This function doesn't preserve correct field name and nullability for the struct type, we only care about data type.
378+
///
379+
/// Returns an `Option` rather than an error so that callers can keep going (for example, try another coercion strategy) when the data types are not coercible to a common type
375380
pub fn type_union_resolution(data_types: &[DataType]) -> Option<DataType> {
376381
if data_types.is_empty() {
377382
return None;
@@ -529,6 +534,89 @@ fn type_union_resolution_coercion(
529534
}
530535
}
531536

537+
/// Handle type union resolution including struct type and others.
538+
pub fn try_type_union_resolution(data_types: &[DataType]) -> Result<Vec<DataType>> {
539+
let err = match try_type_union_resolution_with_struct(data_types) {
540+
Ok(struct_types) => return Ok(struct_types),
541+
Err(e) => Some(e),
542+
};
543+
544+
if let Some(new_type) = type_union_resolution(data_types) {
545+
Ok(vec![new_type; data_types.len()])
546+
} else {
547+
exec_err!("Fail to find the coerced type, errors: {:?}", err)
548+
}
549+
}
550+
551+
// Handle struct where we only change the data type but preserve the field name and nullability.
552+
// Since field name is the key of the struct, so it shouldn't be updated to the common column name like "c0" or "c1"
553+
pub fn try_type_union_resolution_with_struct(
554+
data_types: &[DataType],
555+
) -> Result<Vec<DataType>> {
556+
let mut keys_string: Option<String> = None;
557+
for data_type in data_types {
558+
if let DataType::Struct(fields) = data_type {
559+
let keys = fields.iter().map(|f| f.name().to_owned()).join(",");
560+
if let Some(ref k) = keys_string {
561+
if *k != keys {
562+
return exec_err!("Expect same keys for struct type but got mismatched pair {} and {}", *k, keys);
563+
}
564+
} else {
565+
keys_string = Some(keys);
566+
}
567+
} else {
568+
return exec_err!("Expect to get struct but got {}", data_type);
569+
}
570+
}
571+
572+
let mut struct_types: Vec<DataType> = if let DataType::Struct(fields) = &data_types[0]
573+
{
574+
fields.iter().map(|f| f.data_type().to_owned()).collect()
575+
} else {
576+
return internal_err!("Struct type is checked is the previous function, so this should be unreachable");
577+
};
578+
579+
for data_type in data_types.iter().skip(1) {
580+
if let DataType::Struct(fields) = data_type {
581+
let incoming_struct_types: Vec<DataType> =
582+
fields.iter().map(|f| f.data_type().to_owned()).collect();
583+
// The order of field is verified above
584+
for (lhs_type, rhs_type) in
585+
struct_types.iter_mut().zip(incoming_struct_types.iter())
586+
{
587+
if let Some(coerced_type) =
588+
type_union_resolution_coercion(lhs_type, rhs_type)
589+
{
590+
*lhs_type = coerced_type;
591+
} else {
592+
return exec_err!(
593+
"Fail to find the coerced type for {} and {}",
594+
lhs_type,
595+
rhs_type
596+
);
597+
}
598+
}
599+
} else {
600+
return exec_err!("Expect to get struct but got {}", data_type);
601+
}
602+
}
603+
604+
let mut final_struct_types = vec![];
605+
for s in data_types {
606+
let mut new_fields = vec![];
607+
if let DataType::Struct(fields) = s {
608+
for (i, f) in fields.iter().enumerate() {
609+
let field = Arc::unwrap_or_clone(Arc::clone(f))
610+
.with_data_type(struct_types[i].to_owned());
611+
new_fields.push(Arc::new(field));
612+
}
613+
}
614+
final_struct_types.push(DataType::Struct(new_fields.into()))
615+
}
616+
617+
Ok(final_struct_types)
618+
}
619+
532620
/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a
533621
/// comparison operation
534622
///

datafusion/expr/src/logical_plan/builder.rs

Lines changed: 89 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,15 @@ use crate::{
4646

4747
use super::dml::InsertOp;
4848
use super::plan::ColumnUnnestList;
49+
use arrow::compute::can_cast_types;
4950
use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
5051
use datafusion_common::display::ToStringifiedPlan;
5152
use datafusion_common::file_options::file_type::FileType;
5253
use datafusion_common::{
53-
get_target_functional_dependencies, internal_err, not_impl_err, plan_datafusion_err,
54-
plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, FunctionalDependencies,
55-
Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions,
54+
exec_err, get_target_functional_dependencies, internal_err, not_impl_err,
55+
plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError,
56+
FunctionalDependencies, Result, ScalarValue, TableReference, ToDFSchema,
57+
UnnestOptions,
5658
};
5759
use datafusion_expr_common::type_coercion::binary::type_union_resolution;
5860

@@ -172,12 +174,45 @@ impl LogicalPlanBuilder {
172174
/// `value`. See the [Postgres VALUES](https://www.postgresql.org/docs/current/queries-values.html)
173175
/// documentation for more details.
174176
///
177+
/// so it's usually better to override the default names with a table alias list.
178+
///
179+
/// If the values include params/binders such as $1, $2, $3, etc, then the `param_data_types` should be provided.
180+
pub fn values(values: Vec<Vec<Expr>>) -> Result<Self> {
181+
if values.is_empty() {
182+
return plan_err!("Values list cannot be empty");
183+
}
184+
let n_cols = values[0].len();
185+
if n_cols == 0 {
186+
return plan_err!("Values list cannot be zero length");
187+
}
188+
for (i, row) in values.iter().enumerate() {
189+
if row.len() != n_cols {
190+
return plan_err!(
191+
"Inconsistent data length across values list: got {} values in row {} but expected {}",
192+
row.len(),
193+
i,
194+
n_cols
195+
);
196+
}
197+
}
198+
199+
// Infer from data itself
200+
Self::infer_data(values)
201+
}
202+
203+
/// Create a values list based relation, and the schema is inferred from data itself or table schema if provided, consuming
204+
/// `value`. See the [Postgres VALUES](https://www.postgresql.org/docs/current/queries-values.html)
205+
/// documentation for more details.
206+
///
175207
/// By default, it assigns the names column1, column2, etc. to the columns of a VALUES table.
176208
/// The column names are not specified by the SQL standard and different database systems do it differently,
177209
/// so it's usually better to override the default names with a table alias list.
178210
///
179211
/// If the values include params/binders such as $1, $2, $3, etc, then the `param_data_types` should be provided.
180-
pub fn values(mut values: Vec<Vec<Expr>>) -> Result<Self> {
212+
pub fn values_with_schema(
213+
values: Vec<Vec<Expr>>,
214+
schema: &DFSchemaRef,
215+
) -> Result<Self> {
181216
if values.is_empty() {
182217
return plan_err!("Values list cannot be empty");
183218
}
@@ -196,16 +231,53 @@ impl LogicalPlanBuilder {
196231
}
197232
}
198233

199-
let empty_schema = DFSchema::empty();
234+
// Check the type of value against the schema
235+
Self::infer_values_from_schema(values, schema)
236+
}
237+
238+
fn infer_values_from_schema(
239+
values: Vec<Vec<Expr>>,
240+
schema: &DFSchema,
241+
) -> Result<Self> {
242+
let n_cols = values[0].len();
243+
let mut field_types: Vec<DataType> = Vec::with_capacity(n_cols);
244+
for j in 0..n_cols {
245+
let field_type = schema.field(j).data_type();
246+
for row in values.iter() {
247+
let value = &row[j];
248+
let data_type = value.get_type(schema)?;
249+
250+
if !data_type.equals_datatype(field_type) {
251+
if can_cast_types(&data_type, field_type) {
252+
} else {
253+
return exec_err!(
254+
"type mistmatch and can't cast to got {} and {}",
255+
data_type,
256+
field_type
257+
);
258+
}
259+
}
260+
}
261+
field_types.push(field_type.to_owned());
262+
}
263+
264+
Self::infer_inner(values, &field_types, schema)
265+
}
266+
267+
fn infer_data(values: Vec<Vec<Expr>>) -> Result<Self> {
268+
let n_cols = values[0].len();
269+
let schema = DFSchema::empty();
270+
200271
let mut field_types: Vec<DataType> = Vec::with_capacity(n_cols);
201272
for j in 0..n_cols {
202273
let mut common_type: Option<DataType> = None;
203274
for (i, row) in values.iter().enumerate() {
204275
let value = &row[j];
205-
let data_type = value.get_type(&empty_schema)?;
276+
let data_type = value.get_type(&schema)?;
206277
if data_type == DataType::Null {
207278
continue;
208279
}
280+
209281
if let Some(prev_type) = common_type {
210282
// get common type of each column values.
211283
let data_types = vec![prev_type.clone(), data_type.clone()];
@@ -221,14 +293,22 @@ impl LogicalPlanBuilder {
221293
// since the code loop skips NULL
222294
field_types.push(common_type.unwrap_or(DataType::Null));
223295
}
296+
297+
Self::infer_inner(values, &field_types, &schema)
298+
}
299+
300+
fn infer_inner(
301+
mut values: Vec<Vec<Expr>>,
302+
field_types: &[DataType],
303+
schema: &DFSchema,
304+
) -> Result<Self> {
224305
// wrap cast if data type is not same as common type.
225306
for row in &mut values {
226307
for (j, field_type) in field_types.iter().enumerate() {
227308
if let Expr::Literal(ScalarValue::Null) = row[j] {
228309
row[j] = Expr::Literal(ScalarValue::try_from(field_type)?);
229310
} else {
230-
row[j] =
231-
std::mem::take(&mut row[j]).cast_to(field_type, &empty_schema)?;
311+
row[j] = std::mem::take(&mut row[j]).cast_to(field_type, schema)?;
232312
}
233313
}
234314
}
@@ -243,6 +323,7 @@ impl LogicalPlanBuilder {
243323
.collect::<Vec<_>>();
244324
let dfschema = DFSchema::from_unqualified_fields(fields.into(), HashMap::new())?;
245325
let schema = DFSchemaRef::new(dfschema);
326+
246327
Ok(Self::new(LogicalPlan::Values(Values { schema, values })))
247328
}
248329

datafusion/functions-nested/src/make_array.rs

Lines changed: 14 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,15 @@ use arrow_array::{
2828
use arrow_buffer::OffsetBuffer;
2929
use arrow_schema::DataType::{LargeList, List, Null};
3030
use arrow_schema::{DataType, Field};
31-
use datafusion_common::{exec_err, internal_err};
3231
use datafusion_common::{plan_err, utils::array_into_list_array_nullable, Result};
33-
use datafusion_expr::binary::type_union_resolution;
32+
use datafusion_expr::binary::{
33+
try_type_union_resolution_with_struct, type_union_resolution,
34+
};
3435
use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY;
3536
use datafusion_expr::TypeSignature;
3637
use datafusion_expr::{
3738
ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
3839
};
39-
use itertools::Itertools;
4040

4141
use crate::utils::make_scalar_function;
4242

@@ -111,33 +111,16 @@ impl ScalarUDFImpl for MakeArray {
111111
}
112112

113113
fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
114-
if let Some(new_type) = type_union_resolution(arg_types) {
115-
// TODO: Move the logic to type_union_resolution if this applies to other functions as well
116-
// Handle struct where we only change the data type but preserve the field name and nullability.
117-
// Since field name is the key of the struct, so it shouldn't be updated to the common column name like "c0" or "c1"
118-
let is_struct_and_has_same_key = are_all_struct_and_have_same_key(arg_types)?;
119-
if is_struct_and_has_same_key {
120-
let data_types: Vec<_> = if let DataType::Struct(fields) = &arg_types[0] {
121-
fields.iter().map(|f| f.data_type().to_owned()).collect()
122-
} else {
123-
return internal_err!("Struct type is checked is the previous function, so this should be unreachable");
124-
};
125-
126-
let mut final_struct_types = vec![];
127-
for s in arg_types {
128-
let mut new_fields = vec![];
129-
if let DataType::Struct(fields) = s {
130-
for (i, f) in fields.iter().enumerate() {
131-
let field = Arc::unwrap_or_clone(Arc::clone(f))
132-
.with_data_type(data_types[i].to_owned());
133-
new_fields.push(Arc::new(field));
134-
}
135-
}
136-
final_struct_types.push(DataType::Struct(new_fields.into()))
137-
}
138-
return Ok(final_struct_types);
114+
let mut errors = vec![];
115+
match try_type_union_resolution_with_struct(arg_types) {
116+
Ok(r) => return Ok(r),
117+
Err(e) => {
118+
errors.push(e);
139119
}
120+
}
140121

122+
if let Some(new_type) = type_union_resolution(arg_types) {
123+
// TODO: Move FixedSizeList to List in type_union_resolution
141124
if let DataType::FixedSizeList(field, _) = new_type {
142125
Ok(vec![DataType::List(field); arg_types.len()])
143126
} else if new_type.is_null() {
@@ -147,9 +130,10 @@ impl ScalarUDFImpl for MakeArray {
147130
}
148131
} else {
149132
plan_err!(
150-
"Fail to find the valid type between {:?} for {}",
133+
"Fail to find the valid type between {:?} for {}, errors are {:?}",
151134
arg_types,
152-
self.name()
135+
self.name(),
136+
errors
153137
)
154138
}
155139
}
@@ -188,26 +172,6 @@ fn get_make_array_doc() -> &'static Documentation {
188172
})
189173
}
190174

191-
fn are_all_struct_and_have_same_key(data_types: &[DataType]) -> Result<bool> {
192-
let mut keys_string: Option<String> = None;
193-
for data_type in data_types {
194-
if let DataType::Struct(fields) = data_type {
195-
let keys = fields.iter().map(|f| f.name().to_owned()).join(",");
196-
if let Some(ref k) = keys_string {
197-
if *k != keys {
198-
return exec_err!("Expect same keys for struct type but got mismatched pair {} and {}", *k, keys);
199-
}
200-
} else {
201-
keys_string = Some(keys);
202-
}
203-
} else {
204-
return Ok(false);
205-
}
206-
}
207-
208-
Ok(true)
209-
}
210-
211175
// Empty array is a special case that is useful for many other array functions
212176
pub(super) fn empty_array_type() -> DataType {
213177
DataType::List(Arc::new(Field::new("item", DataType::Int64, true)))

0 commit comments

Comments
 (0)