From 2e616f182046d4ebbd40b7f428f6cfa2b22ae75f Mon Sep 17 00:00:00 2001 From: Joseph Koshakow Date: Mon, 7 Jul 2025 07:35:38 -0400 Subject: [PATCH 1/2] Add support for Arrow Dictionary type in Substrait (#16608) * Add support for Arrow Dictionary type in Substrait This commit adds support for the Arrow Dictionary type in Substrait plans. Resolves #16273 * Add more specific type variation consts (cherry picked from commit d359d6496168da59e6ac4bfea30e648674382f87) --- .../src/logical_plan/consumer/types.rs | 45 +++++++++++-------- .../src/logical_plan/producer/types.rs | 21 ++++++++- datafusion/substrait/src/variation_const.rs | 2 + 3 files changed, 48 insertions(+), 20 deletions(-) diff --git a/datafusion/substrait/src/logical_plan/consumer/types.rs b/datafusion/substrait/src/logical_plan/consumer/types.rs index 7bc30e433d868..87bebe8ddd823 100644 --- a/datafusion/substrait/src/logical_plan/consumer/types.rs +++ b/datafusion/substrait/src/logical_plan/consumer/types.rs @@ -21,7 +21,8 @@ use super::SubstraitConsumer; use crate::variation_const::{ DATE_32_TYPE_VARIATION_REF, DATE_64_TYPE_VARIATION_REF, DECIMAL_128_TYPE_VARIATION_REF, DECIMAL_256_TYPE_VARIATION_REF, - DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF, + DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_MAP_TYPE_VARIATION_REF, + DEFAULT_TYPE_VARIATION_REF, DICTIONARY_MAP_TYPE_VARIATION_REF, INTERVAL_DAY_TIME_TYPE_REF, INTERVAL_MONTH_DAY_NANO_TYPE_NAME, INTERVAL_MONTH_DAY_NANO_TYPE_REF, INTERVAL_YEAR_MONTH_TYPE_REF, LARGE_CONTAINER_TYPE_VARIATION_REF, TIMESTAMP_MICRO_TYPE_VARIATION_REF, @@ -180,24 +181,32 @@ pub fn from_substrait_type( let value_type = map.value.as_ref().ok_or_else(|| { substrait_datafusion_err!("Map type must have value type") })?; - let key_field = Arc::new(Field::new( - "key", - from_substrait_type(consumer, key_type, dfs_names, name_idx)?, - false, - )); - let value_field = Arc::new(Field::new( - "value", - from_substrait_type(consumer, value_type, dfs_names, name_idx)?, - true, - )); - Ok(DataType::Map( - Arc::new(Field::new_struct( - "entries", - [key_field, value_field], - false, // The inner map field is always non-nullable (Arrow #1697), + let key_type = + from_substrait_type(consumer, key_type, dfs_names, name_idx)?; + let value_type = + from_substrait_type(consumer, value_type, dfs_names, name_idx)?; + + match map.type_variation_reference { + DEFAULT_MAP_TYPE_VARIATION_REF => { + let key_field = Arc::new(Field::new("key", key_type, false)); + let value_field = Arc::new(Field::new("value", value_type, true)); + Ok(DataType::Map( + Arc::new(Field::new_struct( + "entries", + [key_field, value_field], + false, // The inner map field is always non-nullable (Arrow #1697), + )), + false, // whether keys are sorted + )) + } + DICTIONARY_MAP_TYPE_VARIATION_REF => Ok(DataType::Dictionary( + Box::new(key_type), + Box::new(value_type), )), - false, // whether keys are sorted - )) + v => not_impl_err!( + "Unsupported Substrait type variation {v} of type {s_kind:?}" + ), + } } r#type::Kind::Decimal(d) => match d.type_variation_reference { DECIMAL_128_TYPE_VARIATION_REF => { diff --git a/datafusion/substrait/src/logical_plan/producer/types.rs b/datafusion/substrait/src/logical_plan/producer/types.rs index 5762cc76b0c8a..9f33415e8216b 100644 --- a/datafusion/substrait/src/logical_plan/producer/types.rs +++ b/datafusion/substrait/src/logical_plan/producer/types.rs @@ -21,7 +21,8 @@ use crate::variation_const::TIMESTAMP_NANO_TYPE_VARIATION_REF; use crate::variation_const::{ DATE_32_TYPE_VARIATION_REF, DATE_64_TYPE_VARIATION_REF, DECIMAL_128_TYPE_VARIATION_REF, DECIMAL_256_TYPE_VARIATION_REF, - DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF, + DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_MAP_TYPE_VARIATION_REF, + DEFAULT_TYPE_VARIATION_REF, DICTIONARY_MAP_TYPE_VARIATION_REF, LARGE_CONTAINER_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF, VIEW_CONTAINER_TYPE_VARIATION_REF, }; @@ -235,13 +236,25 @@ pub(crate) fn to_substrait_type( kind: Some(r#type::Kind::Map(Box::new(r#type::Map { key: Some(Box::new(key_type)), value: Some(Box::new(value_type)), - type_variation_reference: DEFAULT_CONTAINER_TYPE_VARIATION_REF, + type_variation_reference: DEFAULT_MAP_TYPE_VARIATION_REF, nullability, }))), }) } _ => plan_err!("Map fields must contain a Struct with exactly 2 fields"), }, + DataType::Dictionary(key_type, value_type) => { + let key_type = to_substrait_type(key_type, nullable)?; + let value_type = to_substrait_type(value_type, nullable)?; + Ok(substrait::proto::Type { + kind: Some(r#type::Kind::Map(Box::new(r#type::Map { + key: Some(Box::new(key_type)), + value: Some(Box::new(value_type)), + type_variation_reference: DICTIONARY_MAP_TYPE_VARIATION_REF, + nullability, + }))), + }) + } DataType::Struct(fields) => { let field_types = fields .iter() @@ -365,6 +378,10 @@ mod tests { .into(), false, ))?; + round_trip_type(DataType::Dictionary( + Box::new(DataType::Utf8), + Box::new(DataType::Int32), + ))?; round_trip_type(DataType::Struct( vec![ diff --git a/datafusion/substrait/src/variation_const.rs b/datafusion/substrait/src/variation_const.rs index e5bebf8e11819..49ea918980f71 100644 --- a/datafusion/substrait/src/variation_const.rs +++ b/datafusion/substrait/src/variation_const.rs @@ -53,6 +53,8 @@ pub const DATE_64_TYPE_VARIATION_REF: u32 = 1; pub const DEFAULT_CONTAINER_TYPE_VARIATION_REF: u32 = 0; pub const LARGE_CONTAINER_TYPE_VARIATION_REF: u32 = 1; pub const VIEW_CONTAINER_TYPE_VARIATION_REF: u32 = 2; +pub const DEFAULT_MAP_TYPE_VARIATION_REF: u32 = 0; +pub const DICTIONARY_MAP_TYPE_VARIATION_REF: u32 = 1; pub const DECIMAL_128_TYPE_VARIATION_REF: u32 = 0; pub const DECIMAL_256_TYPE_VARIATION_REF: u32 = 1; From e82ca66d36c96adfaf6d6e658a03b4dbcf6bf453 Mon Sep 17 00:00:00 2001 From: LiaCastaneda Date: Mon, 28 Jul 2025 18:05:30 +0200 Subject: [PATCH 2/2] Remove DataDog Specific Workaround --- datafusion/substrait/src/logical_plan/producer/types.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/datafusion/substrait/src/logical_plan/producer/types.rs b/datafusion/substrait/src/logical_plan/producer/types.rs index 9f33415e8216b..0c466dd2233a4 100644 --- a/datafusion/substrait/src/logical_plan/producer/types.rs +++ b/datafusion/substrait/src/logical_plan/producer/types.rs @@ -284,8 +284,6 @@ pub(crate) fn to_substrait_type( precision: *p as i32, })), }), - // TODO: DataDog-specific workaround, don't commit upstream - DataType::Dictionary(_, dt) => to_substrait_type(dt, nullable), _ => not_impl_err!("Unsupported cast type: {dt:?}"), } }