Skip to content

Commit ad087b6

Browse files
authored
Improve docs for DataFusion integration (#7442)
Summary: Reworks the docs for the DataFusion integration, mostly by adding more examples. Also moves some top-level tests into a dedicated module. Signed-off-by: Adam Gutglick <adam@spiraldb.com>
1 parent 550f351 commit ad087b6

File tree

12 files changed

+964
-331
lines changed

12 files changed

+964
-331
lines changed

docs/user-guide/datafusion.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ vortex-datafusion = "<version>"
1414

1515
Register the Vortex format with a `SessionContext`:
1616

17-
:::{literalinclude} ../../vortex-datafusion/src/persistent/mod.rs
17+
:::{literalinclude} ../../vortex-datafusion/src/persistent/tests.rs
1818
:language: rust
1919
:dedent:
2020
:start-after: [setup]
@@ -27,7 +27,7 @@ Register the Vortex format with a `SessionContext`:
2727

2828
Create an external table and query it:
2929

30-
:::{literalinclude} ../../vortex-datafusion/src/persistent/mod.rs
30+
:::{literalinclude} ../../vortex-datafusion/src/persistent/tests.rs
3131
:language: rust
3232
:dedent:
3333
:start-after: [create]
@@ -49,7 +49,7 @@ You can also register a `ListingTable` directly:
4949

5050
Write query results to Vortex using `INSERT INTO`:
5151

52-
:::{literalinclude} ../../vortex-datafusion/src/persistent/mod.rs
52+
:::{literalinclude} ../../vortex-datafusion/src/persistent/tests.rs
5353
:language: rust
5454
:dedent:
5555
:start-after: [write]
@@ -63,7 +63,7 @@ partition value.
6363

6464
Filters and projections are pushed down into the Vortex scan:
6565

66-
:::{literalinclude} ../../vortex-datafusion/src/persistent/mod.rs
66+
:::{literalinclude} ../../vortex-datafusion/src/persistent/tests.rs
6767
:language: rust
6868
:dedent:
6969
:start-after: [query]

vortex-datafusion/src/lib.rs

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,87 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
//! Connectors to enable [DataFusion](https://docs.rs/datafusion/latest/datafusion/) to read [`Vortex`](https://docs.rs/crate/vortex/latest) data.
4+
//! Integrations between [`Vortex`] and [DataFusion].
5+
//!
6+
//! The crate exposes two main entry points:
7+
//!
8+
//! - [`VortexFormatFactory`] for the file-based integration used by SQL,
9+
//! `CREATE EXTERNAL TABLE`, and
10+
//! [`ListingTable`].
11+
//! - [`v2`] for direct integration from an existing Vortex
12+
//! [`DataSourceRef`].
13+
//!
14+
//! # Registering The File Format
15+
//!
16+
//! Most applications register [`VortexFormatFactory`] with a DataFusion
17+
//! [`SessionContext`] and then let DataFusion create [`VortexFormat`] and
18+
//! [`VortexSource`] instances as queries are planned:
19+
//!
20+
//! ```no_run
21+
//! use std::sync::Arc;
22+
//!
23+
//! use datafusion::datasource::provider::DefaultTableFactory;
24+
//! use datafusion::execution::SessionStateBuilder;
25+
//! use datafusion::prelude::SessionContext;
26+
//! use datafusion_common::GetExt;
27+
//! use vortex_datafusion::VortexFormatFactory;
28+
//!
29+
//! # #[tokio::main]
30+
//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
31+
//! let factory = Arc::new(VortexFormatFactory::new());
32+
//! let mut state_builder = SessionStateBuilder::new()
33+
//! .with_default_features()
34+
//! .with_table_factory(
35+
//! factory.get_ext().to_uppercase(),
36+
//! Arc::new(DefaultTableFactory::new()),
37+
//! );
38+
//!
39+
//! if let Some(file_formats) = state_builder.file_formats() {
40+
//! file_formats.push(factory.clone() as _);
41+
//! }
42+
//!
43+
//! let ctx = SessionContext::new_with_state(state_builder.build()).enable_url_table();
44+
//! ctx.sql(
45+
//! "CREATE EXTERNAL TABLE metrics (service VARCHAR, value BIGINT) \
46+
//! STORED AS vortex LOCATION 'file:///tmp/metrics/'",
47+
//! )
48+
//! .await?;
49+
//! # Ok(())
50+
//! # }
51+
//! ```
52+
//!
53+
//! # Registering An Existing Vortex Data Source
54+
//!
55+
//! If your application already has a Vortex [`DataSourceRef`], use
56+
//! [`v2::VortexTable`] to register it directly with DataFusion:
57+
//!
58+
//! ```no_run
59+
//! use std::sync::Arc;
60+
//!
61+
//! use arrow_schema::Schema;
62+
//! use datafusion::prelude::SessionContext;
63+
//! use vortex::VortexSessionDefault;
64+
//! use vortex::scan::DataSourceRef;
65+
//! use vortex::session::VortexSession;
66+
//! use vortex_datafusion::v2::VortexTable;
67+
//!
68+
//! # let data_source: DataSourceRef = todo!();
69+
//! let table = Arc::new(VortexTable::new(
70+
//! data_source,
71+
//! VortexSession::default(),
72+
//! Arc::new(Schema::empty()),
73+
//! ));
74+
//!
75+
//! let ctx = SessionContext::new();
76+
//! ctx.register_table("vortex_data", table)?;
77+
//! # Ok::<(), datafusion_common::DataFusionError>(())
78+
//! ```
79+
//!
80+
//! [`Vortex`]: https://docs.rs/crate/vortex/latest
81+
//! [DataFusion]: https://docs.rs/datafusion/latest/datafusion/
82+
//! [`ListingTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html
83+
//! [`DataSourceRef`]: vortex::scan::DataSourceRef
84+
//! [`SessionContext`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionContext.html
585
#![deny(missing_docs)]
686
use std::fmt::Debug;
787

@@ -18,7 +98,11 @@ mod tests;
1898
pub use convert::exprs::ExpressionConvertor;
1999
pub use persistent::*;
20100

21-
/// Extension trait to convert our [`Precision`](vortex::stats::Precision) to Datafusion's [`Precision`](datafusion_common::stats::Precision)
101+
/// Extension trait to convert our [`Precision`] to DataFusion's
102+
/// [`DataFusionPrecision`].
103+
///
104+
/// [`Precision`]: vortex::expr::stats::Precision
105+
/// [`DataFusionPrecision`]: datafusion_common::stats::Precision
22106
trait PrecisionExt<T>
23107
where
24108
T: Debug + Clone + PartialEq + Eq + PartialOrd,

vortex-datafusion/src/persistent/access_plan.rs

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,46 @@
44
use vortex::layout::scan::scan_builder::ScanBuilder;
55
use vortex::scan::selection::Selection;
66

7-
/// Custom Vortex-specific information that can be provided by external indexes or other sources.
7+
/// Additional Vortex-specific scan constraints attached to a
8+
/// [`PartitionedFile`].
89
///
9-
/// This is intended as a low-level interface for users building their own data systems, see the [advance index] example from the DataFusion repo for a similar usage with Parquet.
10+
/// `VortexAccessPlan` is the hook to use when an external index or planner
11+
/// already knows that only part of a file needs to be scanned. The plan is
12+
/// attached as `extensions` on `PartitionedFile`, and the internal
13+
/// `VortexOpener` applies it before building the Vortex scan.
1014
///
11-
/// [advance index]: https://github.com/apache/datafusion/blob/47df535d2cd5aac5ad5a92bdc837f38e05ea0f0f/datafusion-examples/examples/data_io/parquet_advanced_index.rs
15+
/// The current access plan surface is intentionally small: it lets callers
16+
/// provide a [`Selection`] that narrows the rows considered by the scan.
17+
///
18+
/// # Examples
19+
///
20+
/// ```no_run
21+
/// # use std::sync::Arc;
22+
/// # use datafusion_datasource::PartitionedFile;
23+
/// # use vortex::scan::selection::Selection;
24+
/// use vortex_datafusion::VortexAccessPlan;
25+
///
26+
/// # let selection: Selection = todo!();
27+
/// let file = PartitionedFile::new("metrics.vortex", 1024).with_extensions(Arc::new(
28+
/// VortexAccessPlan::default().with_selection(selection),
29+
/// ));
30+
/// # let _ = file;
31+
/// ```
32+
///
33+
/// This is a low-level integration point for systems building their own access
34+
/// paths on top of DataFusion. For a conceptually similar Parquet example, see
35+
/// DataFusion's
36+
/// [`parquet_advanced_index`].
37+
///
38+
/// [`PartitionedFile`]: datafusion_datasource::PartitionedFile
39+
/// [`parquet_advanced_index`]: https://github.com/apache/datafusion/blob/47df535d2cd5aac5ad5a92bdc837f38e05ea0f0f/datafusion-examples/examples/data_io/parquet_advanced_index.rs
1240
#[derive(Default)]
1341
pub struct VortexAccessPlan {
1442
selection: Option<Selection>,
1543
}
1644

1745
impl VortexAccessPlan {
18-
/// Sets a [`Selection`] for this plan.
46+
/// Sets the row [`Selection`] to apply when the file is opened.
1947
pub fn with_selection(mut self, selection: Selection) -> Self {
2048
self.selection = Some(selection);
2149
self
@@ -28,7 +56,10 @@ impl VortexAccessPlan {
2856
self.selection.as_ref()
2957
}
3058

31-
/// Apply the plan to the scan's builder.
59+
/// Applies this access plan to a [`ScanBuilder`].
60+
///
61+
/// This is used internally by the file opener after it has translated a
62+
/// `PartitionedFile` into a Vortex scan.
3263
pub fn apply_to_builder<A>(&self, mut scan_builder: ScanBuilder<A>) -> ScanBuilder<A>
3364
where
3465
A: 'static + Send,

0 commit comments

Comments
 (0)