Skip to content

Commit d778eb5

Browse files
committed
Improve docs for DataFusion integration
Signed-off-by: Adam Gutglick <adam@spiraldb.com>
1 parent 4a5b7d7 commit d778eb5

11 files changed

Lines changed: 959 additions & 327 deletions

File tree

vortex-datafusion/src/lib.rs

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,87 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
//! Connectors to enable [DataFusion](https://docs.rs/datafusion/latest/datafusion/) to read [`Vortex`](https://docs.rs/crate/vortex/latest) data.
4+
//! Integrations between [`Vortex`] and [DataFusion].
5+
//!
6+
//! The crate exposes two main entry points:
7+
//!
8+
//! - [`VortexFormatFactory`] for the file-based integration used by SQL,
9+
//! `CREATE EXTERNAL TABLE`, and
10+
//! [`ListingTable`].
11+
//! - [`v2`] for direct integration from an existing Vortex
12+
//! [`DataSourceRef`].
13+
//!
14+
//! # Registering The File Format
15+
//!
16+
//! Most applications register [`VortexFormatFactory`] with a DataFusion
17+
//! [`SessionContext`] and then let DataFusion create [`VortexFormat`] and
18+
//! [`VortexSource`] instances as queries are planned:
19+
//!
20+
//! ```no_run
21+
//! use std::sync::Arc;
22+
//!
23+
//! use datafusion::datasource::provider::DefaultTableFactory;
24+
//! use datafusion::execution::SessionStateBuilder;
25+
//! use datafusion::prelude::SessionContext;
26+
//! use datafusion_common::GetExt;
27+
//! use vortex_datafusion::VortexFormatFactory;
28+
//!
29+
//! # #[tokio::main]
30+
//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
31+
//! let factory = Arc::new(VortexFormatFactory::new());
32+
//! let mut state_builder = SessionStateBuilder::new()
33+
//! .with_default_features()
34+
//! .with_table_factory(
35+
//! factory.get_ext().to_uppercase(),
36+
//! Arc::new(DefaultTableFactory::new()),
37+
//! );
38+
//!
39+
//! if let Some(file_formats) = state_builder.file_formats() {
40+
//! file_formats.push(factory.clone() as _);
41+
//! }
42+
//!
43+
//! let ctx = SessionContext::new_with_state(state_builder.build()).enable_url_table();
44+
//! ctx.sql(
45+
//! "CREATE EXTERNAL TABLE metrics (service VARCHAR, value BIGINT) \
46+
//! STORED AS vortex LOCATION 'file:///tmp/metrics/'",
47+
//! )
48+
//! .await?;
49+
//! # Ok(())
50+
//! # }
51+
//! ```
52+
//!
53+
//! # Registering An Existing Vortex Data Source
54+
//!
55+
//! If your application already has a Vortex [`DataSourceRef`], use
56+
//! [`v2::VortexTable`] to register it directly with DataFusion:
57+
//!
58+
//! ```no_run
59+
//! use std::sync::Arc;
60+
//!
61+
//! use arrow_schema::Schema;
62+
//! use datafusion::prelude::SessionContext;
63+
//! use vortex::VortexSessionDefault;
64+
//! use vortex::scan::DataSourceRef;
65+
//! use vortex::session::VortexSession;
66+
//! use vortex_datafusion::v2::VortexTable;
67+
//!
68+
//! # let data_source: DataSourceRef = todo!();
69+
//! let table = Arc::new(VortexTable::new(
70+
//! data_source,
71+
//! VortexSession::default(),
72+
//! Arc::new(Schema::empty()),
73+
//! ));
74+
//!
75+
//! let ctx = SessionContext::new();
76+
//! ctx.register_table("vortex_data", table)?;
77+
//! # Ok::<(), datafusion_common::DataFusionError>(())
78+
//! ```
79+
//!
80+
//! [`Vortex`]: https://docs.rs/crate/vortex/latest
81+
//! [DataFusion]: https://docs.rs/datafusion/latest/datafusion/
82+
//! [`ListingTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html
83+
//! [`DataSourceRef`]: vortex::scan::DataSourceRef
84+
//! [`SessionContext`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionContext.html
585
#![deny(missing_docs)]
686
use std::fmt::Debug;
787

@@ -18,7 +98,11 @@ mod tests;
1898
pub use convert::exprs::ExpressionConvertor;
1999
pub use persistent::*;
20100

21-
/// Extension trait to convert our [`Precision`](vortex::stats::Precision) to Datafusion's [`Precision`](datafusion_common::stats::Precision)
101+
/// Extension trait to convert our [`Precision`] to DataFusion's
102+
/// [`DataFusionPrecision`].
103+
///
104+
/// [`Precision`]: vortex::expr::stats::Precision
105+
/// [`DataFusionPrecision`]: datafusion_common::stats::Precision
22106
trait PrecisionExt<T>
23107
where
24108
T: Debug + Clone + PartialEq + Eq + PartialOrd,

vortex-datafusion/src/persistent/access_plan.rs

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,46 @@
44
use vortex::layout::scan::scan_builder::ScanBuilder;
55
use vortex::scan::selection::Selection;
66

7-
/// Custom Vortex-specific information that can be provided by external indexes or other sources.
7+
/// Additional Vortex-specific scan constraints attached to a
8+
/// [`PartitionedFile`].
89
///
9-
/// This is intended as a low-level interface for users building their own data systems, see the [advance index] example from the DataFusion repo for a similar usage with Parquet.
10+
/// `VortexAccessPlan` is the hook to use when an external index or planner
11+
/// already knows that only part of a file needs to be scanned. The plan is
12+
/// attached as `extensions` on `PartitionedFile`, and the internal
13+
/// `VortexOpener` applies it before building the Vortex scan.
1014
///
11-
/// [advance index]: https://github.com/apache/datafusion/blob/47df535d2cd5aac5ad5a92bdc837f38e05ea0f0f/datafusion-examples/examples/data_io/parquet_advanced_index.rs
15+
/// The current access plan surface is intentionally small: it lets callers
16+
/// provide a [`Selection`] that narrows the rows considered by the scan.
17+
///
18+
/// # Example
19+
///
20+
/// ```no_run
21+
/// # use std::sync::Arc;
22+
/// # use datafusion_datasource::PartitionedFile;
23+
/// # use vortex::scan::selection::Selection;
24+
/// use vortex_datafusion::VortexAccessPlan;
25+
///
26+
/// # let selection: Selection = todo!();
27+
/// let file = PartitionedFile::new("metrics.vortex", 1024).with_extensions(Arc::new(
28+
/// VortexAccessPlan::default().with_selection(selection),
29+
/// ));
30+
/// # let _ = file;
31+
/// ```
32+
///
33+
/// This is a low-level integration point for systems building their own access
34+
/// paths on top of DataFusion. For a conceptually similar Parquet example, see
35+
/// DataFusion's
36+
/// [`parquet_advanced_index`].
37+
///
38+
/// [`PartitionedFile`]: datafusion_datasource::PartitionedFile
39+
/// [`parquet_advanced_index`]: https://github.com/apache/datafusion/blob/47df535d2cd5aac5ad5a92bdc837f38e05ea0f0f/datafusion-examples/examples/data_io/parquet_advanced_index.rs
1240
#[derive(Default)]
1341
pub struct VortexAccessPlan {
1442
selection: Option<Selection>,
1543
}
1644

1745
impl VortexAccessPlan {
18-
/// Sets a [`Selection`] for this plan.
46+
/// Sets the row [`Selection`] to apply when the file is opened.
1947
pub fn with_selection(mut self, selection: Selection) -> Self {
2048
self.selection = Some(selection);
2149
self
@@ -28,7 +56,10 @@ impl VortexAccessPlan {
2856
self.selection.as_ref()
2957
}
3058

31-
/// Apply the plan to the scan's builder.
59+
/// Applies this access plan to a [`ScanBuilder`].
60+
///
61+
/// This is used internally by the file opener after it has translated a
62+
/// `PartitionedFile` into a Vortex scan.
3263
pub fn apply_to_builder<A>(&self, mut scan_builder: ScanBuilder<A>) -> ScanBuilder<A>
3364
where
3465
A: 'static + Send,

vortex-datafusion/src/persistent/format.rs

Lines changed: 127 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,53 @@ use crate::convert::TryToDataFusion;
7070

7171
const DEFAULT_FOOTER_INITIAL_READ_SIZE_BYTES: usize = MAX_POSTSCRIPT_SIZE as usize + EOF_SIZE;
7272

73-
/// Vortex implementation of a DataFusion [`FileFormat`].
73+
/// DataFusion [`FileFormat`] implementation for `.vortex` files.
74+
///
75+
/// Most applications do not construct `VortexFormat` directly. Instead, they
76+
/// register [`VortexFormatFactory`] with a [`SessionContext`] and let
77+
/// DataFusion instantiate `VortexFormat` as tables are planned.
78+
///
79+
/// Construct `VortexFormat` directly when you are wiring a [`ListingTable`] by
80+
/// hand and need to pass a file format into [`ListingOptions`].
81+
///
82+
/// # Example
83+
///
84+
/// ```no_run
85+
/// use std::sync::Arc;
86+
///
87+
/// use datafusion::datasource::listing::ListingOptions;
88+
/// use datafusion::datasource::listing::ListingTable;
89+
/// use datafusion::datasource::listing::ListingTableConfig;
90+
/// use datafusion::datasource::listing::ListingTableUrl;
91+
/// use datafusion::prelude::SessionContext;
92+
/// use tempfile::tempdir;
93+
/// use vortex::VortexSessionDefault;
94+
/// use vortex::session::VortexSession;
95+
/// use vortex_datafusion::VortexFormat;
96+
///
97+
/// # #[tokio::main]
98+
/// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
99+
/// let ctx = SessionContext::new();
100+
/// let dir = tempdir()?;
101+
///
102+
/// let format = Arc::new(VortexFormat::new(VortexSession::default()));
103+
/// let table_url = ListingTableUrl::parse(dir.path().to_str().unwrap())?;
104+
/// let config = ListingTableConfig::new(table_url)
105+
/// .with_listing_options(
106+
/// ListingOptions::new(format).with_session_config_options(ctx.state().config()),
107+
/// )
108+
/// .infer_schema(&ctx.state())
109+
/// .await?;
110+
///
111+
/// let table = ListingTable::try_new(config)?;
112+
/// # let _ = table;
113+
/// # Ok(())
114+
/// # }
115+
/// ```
116+
///
117+
/// [`SessionContext`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionContext.html
118+
/// [`ListingTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html
119+
/// [`ListingOptions`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingOptions.html
74120
pub struct VortexFormat {
75121
session: VortexSession,
76122
opts: VortexTableOptions,
@@ -85,9 +131,24 @@ impl Debug for VortexFormat {
85131
}
86132

87133
config_namespace! {
88-
/// Options to configure the [`VortexFormat`].
134+
/// Options to configure [`VortexFormat`] and [`VortexSource`].
89135
///
90-
/// Can be set through a DataFusion [`SessionConfig`].
136+
/// These options are usually set on a [`VortexFormatFactory`] and inherited
137+
/// by the `VortexFormat` / `VortexSource` instances created for individual
138+
/// tables.
139+
///
140+
/// # Example
141+
///
142+
/// ```rust
143+
/// use vortex_datafusion::{VortexFormatFactory, VortexTableOptions};
144+
///
145+
/// let factory = VortexFormatFactory::new().with_options(VortexTableOptions {
146+
/// projection_pushdown: true,
147+
/// scan_concurrency: Some(8),
148+
/// ..Default::default()
149+
/// });
150+
/// # let _ = factory;
151+
/// ```
91152
///
92153
/// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
93154
pub struct VortexTableOptions {
@@ -113,7 +174,44 @@ config_namespace! {
113174

114175
impl Eq for VortexTableOptions {}
115176

116-
/// Minimal factory to create [`VortexFormat`] instances.
177+
/// Registration entry point for the file-backed Vortex integration.
178+
///
179+
/// `VortexFormatFactory` is the type most applications use. Register it with a
180+
/// DataFusion session, and DataFusion will create [`VortexFormat`] values for
181+
/// `CREATE EXTERNAL TABLE`, [`ListingTable`], and URL-table scans.
182+
///
183+
/// The factory stores a [`VortexSession`] and default [`VortexTableOptions`].
184+
/// Those defaults are copied into the formats and sources created for each
185+
/// table.
186+
///
187+
/// # Example
188+
///
189+
/// ```no_run
190+
/// use std::sync::Arc;
191+
///
192+
/// use datafusion::datasource::provider::DefaultTableFactory;
193+
/// use datafusion::execution::SessionStateBuilder;
194+
/// use datafusion_common::GetExt;
195+
/// use vortex_datafusion::{VortexFormatFactory, VortexTableOptions};
196+
///
197+
/// let factory = Arc::new(VortexFormatFactory::new().with_options(VortexTableOptions {
198+
/// projection_pushdown: true,
199+
/// ..Default::default()
200+
/// }));
201+
///
202+
/// let mut state_builder = SessionStateBuilder::new()
203+
/// .with_default_features()
204+
/// .with_table_factory(
205+
/// factory.get_ext().to_uppercase(),
206+
/// Arc::new(DefaultTableFactory::new()),
207+
/// );
208+
///
209+
/// if let Some(file_formats) = state_builder.file_formats() {
210+
/// file_formats.push(factory.clone() as _);
211+
/// }
212+
/// ```
213+
///
214+
/// [`ListingTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html
117215
#[derive(Debug)]
118216
pub struct VortexFormatFactory {
119217
session: VortexSession,
@@ -127,7 +225,7 @@ impl GetExt for VortexFormatFactory {
127225
}
128226

129227
impl VortexFormatFactory {
130-
/// Creates a new instance with a default [`VortexSession`] and default options.
228+
/// Creates a factory with a default [`VortexSession`] and default options.
131229
#[expect(
132230
clippy::new_without_default,
133231
reason = "FormatFactory defines `default` method, so having `Default` implementation is confusing"
@@ -139,23 +237,33 @@ impl VortexFormatFactory {
139237
}
140238
}
141239

142-
/// Creates a new instance with customized session and default options for all [`VortexFormat`] instances created from this factory.
240+
/// Creates a factory with an explicit session and default options.
143241
///
144-
/// The options can be overridden by table-level configuration pass in [`FileFormatFactory::create`].
242+
/// The supplied options become the baseline for every [`VortexFormat`]
243+
/// created by this factory. DataFusion may still override them with
244+
/// table-level options passed into [`FileFormatFactory::create`].
145245
pub fn new_with_options(session: VortexSession, options: VortexTableOptions) -> Self {
146246
Self {
147247
session,
148248
options: Some(options),
149249
}
150250
}
151251

152-
/// Override the default options for this factory.
252+
/// Overrides the default options for this factory.
253+
///
254+
/// This is the usual way to turn on features such as projection pushdown for
255+
/// every table created through the factory.
256+
///
257+
/// # Example
153258
///
154-
/// For example:
155259
/// ```rust
156260
/// use vortex_datafusion::{VortexFormatFactory, VortexTableOptions};
157261
///
158-
/// let factory = VortexFormatFactory::new().with_options(VortexTableOptions::default());
262+
/// let factory = VortexFormatFactory::new().with_options(VortexTableOptions {
263+
/// projection_pushdown: true,
264+
/// ..Default::default()
265+
/// });
266+
/// # let _ = factory;
159267
/// ```
160268
pub fn with_options(mut self, options: VortexTableOptions) -> Self {
161269
self.options = Some(options);
@@ -195,17 +303,23 @@ impl FileFormatFactory for VortexFormatFactory {
195303
}
196304

197305
impl VortexFormat {
198-
/// Create a new instance with default options.
306+
/// Creates a format with default [`VortexTableOptions`].
307+
///
308+
/// Prefer [`VortexFormatFactory`] when registering with a session. Construct
309+
/// `VortexFormat` directly when building [`ListingOptions`] manually.
310+
///
311+
/// [`ListingOptions`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingOptions.html
199312
pub fn new(session: VortexSession) -> Self {
200313
Self::new_with_options(session, VortexTableOptions::default())
201314
}
202315

203-
/// Creates a new instance with configured by a [`VortexTableOptions`].
316+
/// Creates a format with explicit [`VortexTableOptions`].
204317
pub fn new_with_options(session: VortexSession, opts: VortexTableOptions) -> Self {
205318
Self { session, opts }
206319
}
207320

208-
/// Return the format specific configuration
321+
/// Returns the format-specific configuration that will be copied into the
322+
/// [`VortexSource`] created for a scan.
209323
pub fn options(&self) -> &VortexTableOptions {
210324
&self.opts
211325
}

0 commit comments

Comments
 (0)