Skip to content

Commit 4830a74

Browse files
committed
docs(datafusion): add user documentation for DataFusion integration
Add comprehensive user documentation for the DataFusion integration that covers SQL-based table operations, catalog integration, and query optimization features. Changes: - Add datafusion.md documentation page with setup, SQL operations, metadata tables, partitioned tables, and configuration options - Add datafusion_integration.rs example with annotated code sections - Update SUMMARY.md to include new documentation page - Add required dependencies to examples crate Closes #2027
1 parent b05a675 commit 4830a74

5 files changed

Lines changed: 399 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/examples/Cargo.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,12 @@ rust-version = { workspace = true }
2525
version = { workspace = true }
2626

2727
[dependencies]
28+
datafusion = { workspace = true }
2829
futures = { workspace = true }
2930
iceberg = { workspace = true }
3031
iceberg-catalog-rest = { workspace = true }
32+
iceberg-datafusion = { workspace = true }
33+
tempfile = { workspace = true }
3134
tokio = { workspace = true, features = ["full"] }
3235

3336
[[example]]
@@ -43,6 +46,10 @@ name = "oss-backend"
4346
path = "src/oss_backend.rs"
4447
required-features = ["storage-oss"]
4548

49+
[[example]]
50+
name = "datafusion-integration"
51+
path = "src/datafusion_integration.rs"
52+
4653
[features]
4754
default = []
4855
storage-oss = ["iceberg/storage-oss"]
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Example demonstrating DataFusion integration with Apache Iceberg.
19+
//!
20+
//! This example shows how to:
21+
//! - Set up an Iceberg catalog with DataFusion
22+
//! - Create tables using SQL
23+
//! - Insert and query data
24+
//! - Query metadata tables
25+
26+
use std::collections::HashMap;
27+
use std::sync::Arc;
28+
29+
use datafusion::execution::context::SessionContext;
30+
use datafusion::execution::session_state::SessionStateBuilder;
31+
use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder};
32+
use iceberg::{Catalog, CatalogBuilder, NamespaceIdent};
33+
use iceberg_datafusion::{IcebergCatalogProvider, IcebergTableProviderFactory};
34+
use tempfile::TempDir;
35+
36+
#[tokio::main]
37+
async fn main() -> Result<(), Box<dyn std::error::Error>> {
38+
// Create a temporary directory for the warehouse
39+
let temp_dir = TempDir::new()?;
40+
let warehouse_path = temp_dir.path().to_str().unwrap().to_string();
41+
42+
// ANCHOR: catalog_setup
43+
// Create an in-memory Iceberg catalog
44+
let iceberg_catalog = MemoryCatalogBuilder::default()
45+
.load(
46+
"memory",
47+
HashMap::from([(MEMORY_CATALOG_WAREHOUSE.to_string(), warehouse_path)]),
48+
)
49+
.await?;
50+
51+
// Create a namespace for our tables
52+
let namespace = NamespaceIdent::new("demo".to_string());
53+
iceberg_catalog
54+
.create_namespace(&namespace, HashMap::new())
55+
.await?;
56+
57+
// Create the IcebergCatalogProvider and register it with DataFusion
58+
let catalog_provider =
59+
Arc::new(IcebergCatalogProvider::try_new(Arc::new(iceberg_catalog)).await?);
60+
61+
let ctx = SessionContext::new();
62+
ctx.register_catalog("iceberg", catalog_provider);
63+
// ANCHOR_END: catalog_setup
64+
65+
// ANCHOR: create_table
66+
// Create a table using SQL
67+
ctx.sql(
68+
"CREATE TABLE iceberg.demo.users (
69+
id INT NOT NULL,
70+
name STRING NOT NULL,
71+
email STRING
72+
)",
73+
)
74+
.await?;
75+
76+
println!("Table 'users' created successfully.");
77+
// ANCHOR_END: create_table
78+
79+
// ANCHOR: insert_data
80+
// Insert data into the table
81+
let result = ctx
82+
.sql(
83+
"INSERT INTO iceberg.demo.users VALUES
84+
(1, 'Alice', 'alice@example.com'),
85+
(2, 'Bob', 'bob@example.com'),
86+
(3, 'Charlie', NULL)",
87+
)
88+
.await?
89+
.collect()
90+
.await?;
91+
92+
// The result contains the number of rows inserted
93+
println!("Inserted {} rows.", result[0].num_rows());
94+
// ANCHOR_END: insert_data
95+
96+
// ANCHOR: query_data
97+
// Query the data with filtering
98+
println!("\nQuerying users with email:");
99+
let df = ctx
100+
.sql("SELECT id, name, email FROM iceberg.demo.users WHERE email IS NOT NULL")
101+
.await?;
102+
103+
df.show().await?;
104+
105+
// Query with projection (only specific columns)
106+
println!("\nQuerying only names:");
107+
let df = ctx
108+
.sql("SELECT name FROM iceberg.demo.users ORDER BY id")
109+
.await?;
110+
111+
df.show().await?;
112+
// ANCHOR_END: query_data
113+
114+
// ANCHOR: metadata_tables
115+
// Query the snapshots metadata table
116+
println!("\nTable snapshots:");
117+
let df = ctx
118+
.sql("SELECT snapshot_id, operation FROM iceberg.demo.users$snapshots")
119+
.await?;
120+
121+
df.show().await?;
122+
123+
// Query the manifests metadata table
124+
println!("\nTable manifests:");
125+
let df = ctx
126+
.sql("SELECT path, added_data_files_count FROM iceberg.demo.users$manifests")
127+
.await?;
128+
129+
df.show().await?;
130+
// ANCHOR_END: metadata_tables
131+
132+
println!("\nDataFusion integration example completed successfully!");
133+
134+
Ok(())
135+
}
136+
137+
// ANCHOR: external_table_setup
138+
/// Example of setting up IcebergTableProviderFactory for external tables.
139+
///
140+
/// This allows reading existing Iceberg tables via `CREATE EXTERNAL TABLE` syntax.
141+
#[allow(dead_code)]
142+
async fn setup_external_table_support() -> SessionContext {
143+
// Create a session state with the Iceberg table factory registered
144+
let mut state = SessionStateBuilder::new().with_default_features().build();
145+
146+
// Register the IcebergTableProviderFactory to handle "ICEBERG" file type
147+
state.table_factories_mut().insert(
148+
"ICEBERG".to_string(),
149+
Arc::new(IcebergTableProviderFactory::new()),
150+
);
151+
152+
SessionContext::new_with_state(state)
153+
}
154+
// ANCHOR_END: external_table_setup
155+
156+
// ANCHOR: external_table_query
/// Example SQL for creating and querying an external Iceberg table.
///
/// This function is intentionally empty; it exists only to anchor the SQL
/// snippet below for inclusion in the user documentation.
///
/// ```sql
/// -- Create an external table from an existing Iceberg metadata file
/// CREATE EXTERNAL TABLE my_table
/// STORED AS ICEBERG
/// LOCATION '/path/to/iceberg/metadata/v1.metadata.json';
///
/// -- Query the external table
/// SELECT * FROM my_table WHERE column > 100;
/// ```
#[allow(dead_code)]
fn external_table_sql_example() {}
// ANCHOR_END: external_table_query

website/src/SUMMARY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
- [Install](./install.md)
2525
- [Download](./download.md)
2626
- [API](./api.md)
27+
- [DataFusion Integration](./datafusion.md)
2728

2829
# Developer Guide
2930

0 commit comments

Comments
 (0)