Skip to content

Commit 0eabd10

Browse files
committed
Further improvements of the extension type API proposal
1 parent 2a48e73 commit 0eabd10

14 files changed

Lines changed: 100 additions & 120 deletions

File tree

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion-examples/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ cargo run --example dataframe -- dataframe
133133

134134
| Subcommand | File Path | Description |
135135
| --- | --- | --- |
136-
| my_id | [`extension_types/event_id.rs`](examples/extension_types/event_id.rs) | A custom wrapper around integers that represent event ids |
136+
| event_id | [`extension_types/event_id.rs`](examples/extension_types/event_id.rs) | A custom wrapper around integers that represent event ids |
137137

138138
## External Dependency Examples
139139

datafusion-examples/examples/extension_types/event_id.rs

Lines changed: 7 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ use datafusion::error::Result;
2424
use datafusion::execution::SessionStateBuilder;
2525
use datafusion::prelude::SessionContext;
2626
use datafusion_common::internal_err;
27-
use datafusion_common::types::{DFExtensionType, DFExtensionTypeRef};
27+
use datafusion_common::types::DFExtensionType;
2828
use datafusion_expr::registry::{
29-
ExtensionTypeRegistration, ExtensionTypeRegistry, MemoryExtensionTypeRegistry,
29+
DefaultExtensionTypeRegistration, ExtensionTypeRegistry, MemoryExtensionTypeRegistry,
3030
};
3131
use std::fmt::Write;
3232
use std::sync::Arc;
@@ -48,7 +48,9 @@ pub async fn event_id_example() -> Result<()> {
4848
fn create_session_context() -> Result<SessionContext> {
4949
// Create a registry with a reference to the custom extension type implementation.
5050
let registry = MemoryExtensionTypeRegistry::new();
51-
let event_id_registration = Arc::new(EventIdExtensionTypeRegistration {});
51+
let event_id_registration = DefaultExtensionTypeRegistration::new_arc(|metadata| {
52+
Ok(EventIdExtensionType(metadata))
53+
});
5254
registry.add_extension_type_registration(event_id_registration)?;
5355

5456
// Set the extension type registry in the session state so that DataFusion can use it.
@@ -104,8 +106,8 @@ fn example_schema() -> SchemaRef {
104106
]))
105107
}
106108

107-
/// Represents a 32-bit custom identifier that represents a single event. Using this format is not
108-
/// a good idea in practice, but it is useful for demonstrating the API usage.
109+
/// A 32-bit custom identifier that represents a single event. Using this format is
110+
/// probably not a good idea in practice, but it is useful for demonstrating the API usage.
109111
///
110112
/// An event is constructed of three parts:
111113
/// - The year
@@ -274,30 +276,6 @@ impl DisplayIndex for EventIdDisplayIndex<'_> {
274276
}
275277
}
276278

277-
/// The registration is the last piece missing for the extension type implementation. It contains
278-
/// the logic for deserializing the metadata from the arrow [`Field`]s and creating the extension
279-
/// type instance. We cannot use the trait from arrow-rs as it's not dyn-compatible (the Metadata
280-
/// type must be known at compile time).
281-
///
282-
/// If an extension type does not have any parameters, the [`SimpleExtensionTypeRegistration`]
283-
/// provides an easier way of registering it.
284-
#[derive(Debug)]
285-
pub struct EventIdExtensionTypeRegistration();
286-
287-
impl ExtensionTypeRegistration for EventIdExtensionTypeRegistration {
288-
fn type_name(&self) -> &str {
289-
EventIdExtensionType::NAME
290-
}
291-
292-
fn create_df_extension_type(
293-
&self,
294-
metadata: Option<&str>,
295-
) -> Result<DFExtensionTypeRef> {
296-
let metadata = EventIdExtensionType::deserialize_metadata(metadata)?;
297-
Ok(Arc::new(EventIdExtensionType(metadata)))
298-
}
299-
}
300-
301279
#[cfg(test)]
302280
mod tests {
303281
use super::*;

datafusion-examples/examples/extension_types/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,13 @@
2121
//!
2222
//! ## Usage
2323
//! ```bash
24-
//! cargo run --example dataframe -- [all|my_id]
24+
//! cargo run --example extension_types -- [all|event_id]
2525
//! ```
2626
//!
2727
//! Each subcommand runs a corresponding example:
2828
//! - `all` — run all examples included in this module
2929
//!
30-
//! - `my_id`
30+
//! - `event_id`
3131
//! (file: event_id.rs, desc: A custom wrapper around integers that represent event ids)
3232
3333
mod event_id;

datafusion/common/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ apache-avro = { workspace = true, features = [
6666
"zstandard",
6767
], optional = true }
6868
arrow = { workspace = true }
69+
arrow-schema = { workspace = true, features = ["canonical_extension_types"] }
6970
arrow-ipc = { workspace = true }
7071
chrono = { workspace = true }
7172
half = { workspace = true }
Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1 @@
11
mod uuid;
2-
3-
pub use uuid::*;

datafusion/common/src/types/canonical_extensions/uuid.rs

Lines changed: 4 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,7 @@ use uuid::{Bytes, Uuid};
99
/// Defines the extension type logic for the canonical `arrow.uuid` extension type.
1010
///
1111
/// See [`DFExtensionType`] for information on DataFusion's extension type mechanism.
12-
#[derive(Debug)]
13-
pub struct UuidDFExtensionType();
14-
15-
impl UuidDFExtensionType {
16-
/// Create a new instance of [`UuidDFExtensionType`].
17-
pub fn new() -> Self {
18-
Self {}
19-
}
20-
}
21-
22-
impl Default for UuidDFExtensionType {
23-
fn default() -> Self {
24-
Self::new()
25-
}
26-
}
27-
28-
impl DFExtensionType for UuidDFExtensionType {
12+
impl DFExtensionType for arrow_schema::extension::Uuid {
2913
fn create_array_formatter<'fmt>(
3014
&self,
3115
array: &'fmt dyn Array,
@@ -56,14 +40,14 @@ struct UuidValueDisplayIndex<'a> {
5640
impl DisplayIndex for UuidValueDisplayIndex<'_> {
5741
fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult {
5842
if self.array.is_null(idx) {
59-
write!(f, "arrow.uuid({})", self.null_str)?;
43+
write!(f, "{}", self.null_str)?;
6044
return Ok(());
6145
}
6246

6347
let bytes = Bytes::try_from(self.array.value(idx))
6448
.expect("FixedSizeBinaryArray length checked in create_array_formatter");
6549
let uuid = Uuid::from_bytes(bytes);
66-
write!(f, "arrow.uuid({uuid})")?;
50+
write!(f, "{uuid}")?;
6751
Ok(())
6852
}
6953
}
@@ -88,7 +72,7 @@ mod tests {
8872

8973
assert_eq!(
9074
formatter.value(0).to_string(),
91-
"arrow.uuid(00000000-0000-0000-0000-000000000000)"
75+
"00000000-0000-0000-0000-000000000000"
9276
);
9377
}
9478
}

datafusion/common/src/types/extension.rs

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,36 +7,38 @@ use std::sync::Arc;
77
/// A cheaply cloneable pointer to a [`DFExtensionType`].
88
pub type DFExtensionTypeRef = Arc<dyn DFExtensionType>;
99

10-
/// Represents an implementation of a DataFusion extension type, allowing users to customize the
11-
/// behavior of DataFusion for custom extension types.
12-
///
13-
/// Extension types may change the semantics of a column. For example, adding two values of
14-
/// [`DataType::Int64`] is a sensible thing to do. However, if the same data type is annotated with
15-
/// an extension type like `custom.id`, the correct interpretation of a column changes. For example,
16-
/// adding together two `custom.id` values (represented as a 64-bit integer) may no longer make
17-
/// sense.
18-
///
19-
/// Note that while helping users to navigate the semantic gap between the data type and extension
20-
/// types is a goal of this trait, DataFusion's extension type support is still evolving and does
21-
/// not cover all use cases. Currently, the following capabilities can be customized:
10+
/// Represents an implementation of a DataFusion extension type.
11+
///
12+
/// This allows users to customize the behavior of DataFusion for certain types. Having this ability
13+
/// is necessary because extension types affect how columns should be treated by the query engine.
14+
/// This effect includes which operations are possible on a column and what results are expected
15+
/// from these operations. The extension type mechanism allows users to define how these operations
16+
/// apply to a particular extension type.
17+
///
18+
/// For example, adding two values of [`DataType::Int64`] is a sensible thing to do. However, if the
19+
/// same column is annotated with an extension type like `custom.id`, the correct interpretation of
20+
/// the column changes. Adding together two `custom.id` values, even though they are stored as
21+
/// integers, may no longer make sense.
22+
///
23+
/// Note that DataFusion's extension type support is still young and therefore might not cover all
24+
/// relevant use cases. Currently, the following operations can be customized:
2225
/// - Pretty-printing values in record batches
2326
///
2427
/// # Relation to Arrow's `ExtensionType`
2528
///
26-
/// The purpose of Arrow's `ExtensionType` trait, for the time being, is to provide a way to handle
27-
/// metadata of an extension type in a type-safe manner. The trait does not provide any
28-
/// customization options such that users can customize the behavior of any kernels (e.g.,
29-
/// [`DFExtensionType::create_array_formatter`] for formatting record batches). Therefore,
30-
/// downstream users (such as DataFusion) have the flexibility to implement the extension type
31-
/// mechanism according to their needs. [`DFExtensionType`] is DataFusion's implementation of this
32-
/// extension type mechanism.
29+
/// The purpose of Arrow's `ExtensionType` trait, for the time being, is to allow reading and
30+
/// writing extension type metadata in a type-safe manner. The trait does not provide any
31+
/// customization options. Therefore, downstream users (such as DataFusion) have the flexibility to
32+
/// implement the extension type mechanism according to their needs. [`DFExtensionType`] is
33+
/// DataFusion's implementation of this extension type mechanism.
3334
///
34-
/// Furthermore, Arrow's current trait is not dyn-compatible which we need for implementing
35+
/// Furthermore, the current trait in arrow-rs is not dyn-compatible, which we need for implementing
3536
/// extension type registries. In the future, the two implementations may increasingly converge.
3637
///
37-
/// # Example
38-
///
38+
/// # Examples
3939
///
40+
/// Examples of using the extension type machinery can be found in the DataFusion examples
41+
/// directory.
4042
pub trait DFExtensionType: Debug + Send + Sync {
4143
/// Returns an [`ArrayFormatter`] that can format values of this type.
4244
///

datafusion/common/src/types/mod.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ mod logical;
2323
mod native;
2424

2525
pub use builtin::*;
26-
pub use canonical_extensions::*;
2726
pub use extension::*;
2827
pub use field::*;
2928
pub use logical::*;

datafusion/core/src/execution/session_state.rs

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ use crate::datasource::provider_as_source;
3030
use crate::execution::SessionStateDefaults;
3131
use crate::execution::context::{EmptySerializerRegistry, FunctionFactory, QueryPlanner};
3232
use crate::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
33-
use arrow_schema::extension::ExtensionType;
3433
use arrow_schema::{DataType, FieldRef};
3534
use datafusion_catalog::MemoryCatalogProviderList;
3635
use datafusion_catalog::information_schema::{
@@ -58,9 +57,9 @@ use datafusion_expr::planner::ExprPlanner;
5857
#[cfg(feature = "sql")]
5958
use datafusion_expr::planner::{RelationPlanner, TypePlanner};
6059
use datafusion_expr::registry::{
61-
ExtensionTypeRegistration, ExtensionTypeRegistrationRef, ExtensionTypeRegistry,
62-
ExtensionTypeRegistryRef, FunctionRegistry, MemoryExtensionTypeRegistry,
63-
SerializerRegistry, SimpleExtensionTypeRegistration,
60+
DefaultExtensionTypeRegistration, ExtensionTypeRegistration,
61+
ExtensionTypeRegistrationRef, ExtensionTypeRegistry, ExtensionTypeRegistryRef,
62+
FunctionRegistry, MemoryExtensionTypeRegistry, SerializerRegistry,
6463
};
6564
use datafusion_expr::simplify::SimplifyContext;
6665
use datafusion_expr::{AggregateUDF, Explain, Expr, LogicalPlan, ScalarUDF, WindowUDF};
@@ -82,7 +81,6 @@ use datafusion_sql::{
8281

8382
use async_trait::async_trait;
8483
use chrono::{DateTime, Utc};
85-
use datafusion_common::types::UuidDFExtensionType;
8684
use itertools::Itertools;
8785
use log::{debug, info};
8886
use object_store::ObjectStore;
@@ -1353,10 +1351,10 @@ impl SessionStateBuilder {
13531351
/// May fail if an already registered [`ExtensionTypeRegistry`] raises an error while
13541352
/// registering the canonical extension types.
13551353
pub fn with_canonical_extension_types(mut self) -> datafusion_common::Result<Self> {
1356-
let canonical_extension_types = vec![SimpleExtensionTypeRegistration::new_arc(
1357-
arrow_schema::extension::Uuid::NAME,
1358-
Arc::new(UuidDFExtensionType::new()),
1359-
)];
1354+
let uuid = DefaultExtensionTypeRegistration::new_arc(|_| {
1355+
Ok(arrow_schema::extension::Uuid {})
1356+
});
1357+
let canonical_extension_types = vec![uuid];
13601358

13611359
match &self.extension_types {
13621360
None => {

0 commit comments

Comments
 (0)