Skip to content

Commit 94a836d

Browse files
committed
Address PR review feedback
- Change catalog name from "iceberg" to "my_catalog" to clarify it has no special meaning
- Move external table code to a separate example file
- Remove sections not suited for end-users: External Tables, Table Provider Types, Creating Partitioned Tables (Rust API), Query Optimization
- Add clarification that table properties must be set via the Iceberg catalog API
1 parent 4830a74 commit 94a836d

4 files changed

Lines changed: 94 additions & 110 deletions

File tree

crates/examples/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ required-features = ["storage-oss"]
5050
name = "datafusion-integration"
5151
path = "src/datafusion_integration.rs"
5252

53+
[[example]]
54+
name = "datafusion-external-table"
55+
path = "src/datafusion_external_table.rs"
56+
5357
[features]
5458
default = []
5559
storage-oss = ["iceberg/storage-oss"]
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Example demonstrating external Iceberg table access via DataFusion.
19+
//!
20+
//! This example shows how to use `IcebergTableProviderFactory` to read
21+
//! existing Iceberg tables via `CREATE EXTERNAL TABLE` syntax.
22+
//!
23+
//! Note: External tables are read-only. For write operations, use
24+
//! `IcebergCatalogProvider` instead (see `datafusion_integration.rs`).
25+
26+
use std::sync::Arc;
27+
28+
use datafusion::execution::context::SessionContext;
29+
use datafusion::execution::session_state::SessionStateBuilder;
30+
use iceberg_datafusion::IcebergTableProviderFactory;
31+
32+
// ANCHOR: external_table_setup
33+
/// Set up a DataFusion session with IcebergTableProviderFactory registered.
34+
///
35+
/// This allows reading existing Iceberg tables via `CREATE EXTERNAL TABLE` syntax.
36+
fn setup_external_table_support() -> SessionContext {
37+
// Create a session state with the Iceberg table factory registered
38+
let mut state = SessionStateBuilder::new().with_default_features().build();
39+
40+
// Register the IcebergTableProviderFactory to handle "ICEBERG" file type
41+
state.table_factories_mut().insert(
42+
"ICEBERG".to_string(),
43+
Arc::new(IcebergTableProviderFactory::new()),
44+
);
45+
46+
SessionContext::new_with_state(state)
47+
}
48+
// ANCHOR_END: external_table_setup
49+
50+
#[tokio::main]
51+
async fn main() -> Result<(), Box<dyn std::error::Error>> {
52+
let ctx = setup_external_table_support();
53+
54+
// ANCHOR: external_table_query
55+
// Example SQL for creating and querying an external Iceberg table:
56+
//
57+
// CREATE EXTERNAL TABLE my_table
58+
// STORED AS ICEBERG
59+
// LOCATION '/path/to/iceberg/metadata/v1.metadata.json';
60+
//
61+
// SELECT * FROM my_table WHERE column > 100;
62+
// ANCHOR_END: external_table_query
63+
64+
println!("External table support configured.");
65+
println!("Use CREATE EXTERNAL TABLE ... STORED AS ICEBERG to read existing tables.");
66+
println!();
67+
println!("Example:");
68+
println!(" CREATE EXTERNAL TABLE my_table");
69+
println!(" STORED AS ICEBERG");
70+
println!(" LOCATION '/path/to/iceberg/metadata/v1.metadata.json';");
71+
72+
// This example requires an actual Iceberg table to query.
73+
// For a complete working example with table creation, see datafusion_integration.rs
74+
75+
// Verify the session is configured correctly
76+
let tables = ctx.catalog_names();
77+
println!("\nRegistered catalogs: {tables:?}");
78+
79+
Ok(())
80+
}

crates/examples/src/datafusion_integration.rs

Lines changed: 8 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,9 @@ use std::collections::HashMap;
2727
use std::sync::Arc;
2828

2929
use datafusion::execution::context::SessionContext;
30-
use datafusion::execution::session_state::SessionStateBuilder;
3130
use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder};
3231
use iceberg::{Catalog, CatalogBuilder, NamespaceIdent};
33-
use iceberg_datafusion::{IcebergCatalogProvider, IcebergTableProviderFactory};
32+
use iceberg_datafusion::IcebergCatalogProvider;
3433
use tempfile::TempDir;
3534

3635
#[tokio::main]
@@ -59,13 +58,13 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
5958
Arc::new(IcebergCatalogProvider::try_new(Arc::new(iceberg_catalog)).await?);
6059

6160
let ctx = SessionContext::new();
62-
ctx.register_catalog("iceberg", catalog_provider);
61+
ctx.register_catalog("my_catalog", catalog_provider);
6362
// ANCHOR_END: catalog_setup
6463

6564
// ANCHOR: create_table
6665
// Create a table using SQL
6766
ctx.sql(
68-
"CREATE TABLE iceberg.demo.users (
67+
"CREATE TABLE my_catalog.demo.users (
6968
id INT NOT NULL,
7069
name STRING NOT NULL,
7170
email STRING
@@ -80,7 +79,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
8079
// Insert data into the table
8180
let result = ctx
8281
.sql(
83-
"INSERT INTO iceberg.demo.users VALUES
82+
"INSERT INTO my_catalog.demo.users VALUES
8483
(1, 'Alice', 'alice@example.com'),
8584
(2, 'Bob', 'bob@example.com'),
8685
(3, 'Charlie', NULL)",
@@ -97,15 +96,15 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
9796
// Query the data with filtering
9897
println!("\nQuerying users with email:");
9998
let df = ctx
100-
.sql("SELECT id, name, email FROM iceberg.demo.users WHERE email IS NOT NULL")
99+
.sql("SELECT id, name, email FROM my_catalog.demo.users WHERE email IS NOT NULL")
101100
.await?;
102101

103102
df.show().await?;
104103

105104
// Query with projection (only specific columns)
106105
println!("\nQuerying only names:");
107106
let df = ctx
108-
.sql("SELECT name FROM iceberg.demo.users ORDER BY id")
107+
.sql("SELECT name FROM my_catalog.demo.users ORDER BY id")
109108
.await?;
110109

111110
df.show().await?;
@@ -115,15 +114,15 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
115114
// Query the snapshots metadata table
116115
println!("\nTable snapshots:");
117116
let df = ctx
118-
.sql("SELECT snapshot_id, operation FROM iceberg.demo.users$snapshots")
117+
.sql("SELECT snapshot_id, operation FROM my_catalog.demo.users$snapshots")
119118
.await?;
120119

121120
df.show().await?;
122121

123122
// Query the manifests metadata table
124123
println!("\nTable manifests:");
125124
let df = ctx
126-
.sql("SELECT path, added_data_files_count FROM iceberg.demo.users$manifests")
125+
.sql("SELECT path, added_data_files_count FROM my_catalog.demo.users$manifests")
127126
.await?;
128127

129128
df.show().await?;
@@ -133,38 +132,3 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
133132

134133
Ok(())
135134
}
136-
137-
// ANCHOR: external_table_setup
138-
/// Example of setting up IcebergTableProviderFactory for external tables.
139-
///
140-
/// This allows reading existing Iceberg tables via `CREATE EXTERNAL TABLE` syntax.
141-
#[allow(dead_code)]
142-
async fn setup_external_table_support() -> SessionContext {
143-
// Create a session state with the Iceberg table factory registered
144-
let mut state = SessionStateBuilder::new().with_default_features().build();
145-
146-
// Register the IcebergTableProviderFactory to handle "ICEBERG" file type
147-
state.table_factories_mut().insert(
148-
"ICEBERG".to_string(),
149-
Arc::new(IcebergTableProviderFactory::new()),
150-
);
151-
152-
SessionContext::new_with_state(state)
153-
}
154-
// ANCHOR_END: external_table_setup
155-
156-
// ANCHOR: external_table_query
157-
/// Example SQL for creating and querying an external Iceberg table.
158-
///
159-
/// ```sql
160-
/// -- Create an external table from an existing Iceberg metadata file
161-
/// CREATE EXTERNAL TABLE my_table
162-
/// STORED AS ICEBERG
163-
/// LOCATION '/path/to/iceberg/metadata/v1.metadata.json';
164-
///
165-
/// -- Query the external table
166-
/// SELECT * FROM my_table WHERE column > 100;
167-
/// ```
168-
#[allow(dead_code)]
169-
fn external_table_sql_example() {}
170-
// ANCHOR_END: external_table_query

website/src/datafusion.md

Lines changed: 2 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ The `iceberg-datafusion` crate provides integration between Apache Iceberg and [
2424
## Features
2525

2626
- **SQL DDL/DML**: `CREATE TABLE`, `INSERT INTO`, `SELECT`
27-
- **Query Optimization**: Projection, filter, and LIMIT pushdown
2827
- **Metadata Tables**: Query snapshots and manifests
2928
- **Partitioned Tables**: Automatic partition routing for writes
3029

@@ -100,63 +99,8 @@ Available metadata tables:
10099
- `table$snapshots` - Table snapshot history
101100
- `table$manifests` - Manifest file information
102101

103-
## File-Based Access (External Tables)
104-
105-
For reading existing Iceberg tables without a catalog, use `IcebergTableProviderFactory`:
106-
107-
```rust,no_run,noplayground
108-
{{#rustdoc_include ../../crates/examples/src/datafusion_integration.rs:external_table_setup}}
109-
```
110-
111-
Then create external tables via SQL:
112-
113-
```sql
114-
CREATE EXTERNAL TABLE my_table
115-
STORED AS ICEBERG
116-
LOCATION '/path/to/iceberg/metadata/v1.metadata.json';
117-
118-
SELECT * FROM my_table;
119-
```
120-
121-
> **Note**: External tables are read-only. For write operations, use `IcebergCatalogProvider`.
122-
123-
## Table Provider Types
124-
125-
### IcebergTableProvider
126-
127-
- Backed by an Iceberg catalog
128-
- Automatically refreshes metadata on each operation
129-
- Supports both read and write operations
130-
- Use when you need the latest table state or write capability
131-
132-
### IcebergStaticTableProvider
133-
134-
- Fixed table snapshot at construction time
135-
- No catalog round-trips (better performance)
136-
- Read-only
137-
- Use for time-travel queries or when consistency within a query is important
138-
139102
## Partitioned Tables
140103

141-
### Creating Partitioned Tables
142-
143-
Partitioned tables must be created using the Iceberg catalog API (not SQL):
144-
145-
```rust,no_run
146-
use iceberg::spec::{Transform, UnboundPartitionSpec};
147-
148-
let partition_spec = UnboundPartitionSpec::builder()
149-
.with_spec_id(0)
150-
.add_partition_field(column_id, "partition_column", Transform::Identity)?
151-
.build();
152-
```
153-
154-
Supported partition transforms:
155-
- `Identity` - Partition by exact value
156-
- `Year`, `Month`, `Day`, `Hour` - Time-based partitioning
157-
- `Bucket(n)` - Hash partitioning into n buckets
158-
- `Truncate(width)` - String/number truncation
159-
160104
### Writing to Partitioned Tables
161105

162106
When inserting into a partitioned table, data is automatically routed to the correct partition directories:
@@ -184,18 +128,10 @@ Configure via table property:
184128
write.datafusion.fanout.enabled = true
185129
```
186130

187-
## Query Optimization
188-
189-
The DataFusion integration supports several query optimizations:
190-
191-
- **Projection Pushdown**: Only reads columns referenced in the query
192-
- **Filter Pushdown**: Prunes data files using manifest statistics
193-
- **LIMIT Pushdown**: Reduces the amount of data scanned
194-
195-
These optimizations are applied automatically by the query planner.
196-
197131
## Configuration Options
198132

133+
These table properties control write behavior. They must be set when creating the table via the Iceberg catalog API, as DataFusion SQL does not support `ALTER TABLE` for property changes.
134+
199135
| Property | Default | Description |
200136
|----------|---------|-------------|
201137
| `write.datafusion.fanout.enabled` | `true` | Use FanoutWriter (true) or ClusteredWriter (false) for partitioned writes |

0 commit comments

Comments
 (0)