Skip to content

Commit a5fc25f

Browse files
feat: add s3 file io integration (#548)
1 parent cda8fc4 commit a5fc25f

30 files changed

+1244
-17
lines changed

.github/workflows/test.yml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,24 @@ jobs:
4141
name: AMD64 Ubuntu 24.04
4242
runs-on: ubuntu-24.04
4343
timeout-minutes: 30
44+
strategy:
45+
fail-fast: false
46+
env:
47+
ICEBERG_TEST_S3_URI: s3://iceberg-test
48+
AWS_ACCESS_KEY_ID: minio
49+
AWS_SECRET_ACCESS_KEY: minio123
50+
AWS_DEFAULT_REGION: us-east-1
51+
AWS_ENDPOINT_URL: http://127.0.0.1:9000
52+
AWS_EC2_METADATA_DISABLED: "TRUE"
4453
steps:
4554
- name: Checkout iceberg-cpp
4655
uses: actions/checkout@v6
4756
- name: Install dependencies
4857
shell: bash
4958
run: sudo apt-get update && sudo apt-get install -y libcurl4-openssl-dev
59+
- name: Start MinIO
60+
shell: bash
61+
run: bash ci/scripts/start_minio.sh
5062
- name: Build Iceberg
5163
shell: bash
5264
env:
@@ -63,9 +75,21 @@ jobs:
6375
name: AArch64 macOS 26
6476
runs-on: macos-26
6577
timeout-minutes: 30
78+
strategy:
79+
fail-fast: false
80+
env:
81+
ICEBERG_TEST_S3_URI: s3://iceberg-test
82+
AWS_ACCESS_KEY_ID: minio
83+
AWS_SECRET_ACCESS_KEY: minio123
84+
AWS_DEFAULT_REGION: us-east-1
85+
AWS_ENDPOINT_URL: http://127.0.0.1:9000
86+
AWS_EC2_METADATA_DISABLED: "TRUE"
6687
steps:
6788
- name: Checkout iceberg-cpp
6889
uses: actions/checkout@v6
90+
- name: Start MinIO
91+
shell: bash
92+
run: bash ci/scripts/start_minio.sh
6993
- name: Build Iceberg
7094
shell: bash
7195
run: ci/scripts/build_iceberg.sh $(pwd)
@@ -76,6 +100,15 @@ jobs:
76100
name: AMD64 Windows 2025
77101
runs-on: windows-2025
78102
timeout-minutes: 60
103+
strategy:
104+
fail-fast: false
105+
env:
106+
ICEBERG_TEST_S3_URI: s3://iceberg-test
107+
AWS_ACCESS_KEY_ID: minio
108+
AWS_SECRET_ACCESS_KEY: minio123
109+
AWS_DEFAULT_REGION: us-east-1
110+
AWS_ENDPOINT_URL: http://127.0.0.1:9000
111+
AWS_EC2_METADATA_DISABLED: "TRUE"
79112
steps:
80113
- name: Checkout iceberg-cpp
81114
uses: actions/checkout@v6
@@ -85,6 +118,9 @@ jobs:
85118
vcpkg install zlib:x64-windows nlohmann-json:x64-windows nanoarrow:x64-windows roaring:x64-windows cpr:x64-windows
86119
- name: Setup sccache
87120
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9
121+
- name: Start MinIO
122+
shell: bash
123+
run: bash ci/scripts/start_minio.sh
88124
- name: Build Iceberg
89125
shell: cmd
90126
env:

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ option(ICEBERG_BUILD_TESTS "Build tests" ON)
4545
option(ICEBERG_BUILD_BUNDLE "Build the battery included library" ON)
4646
option(ICEBERG_BUILD_REST "Build rest catalog client" ON)
4747
option(ICEBERG_BUILD_REST_INTEGRATION_TESTS "Build rest catalog integration tests" OFF)
48+
option(ICEBERG_S3 "Build with S3 support" OFF)
4849
option(ICEBERG_ENABLE_ASAN "Enable Address Sanitizer" OFF)
4950
option(ICEBERG_ENABLE_UBSAN "Enable Undefined Behavior Sanitizer" OFF)
5051

@@ -68,6 +69,12 @@ if(ICEBERG_BUILD_REST_INTEGRATION_TESTS AND WIN32)
6869
message(WARNING "Cannot build rest integration test on Windows, turning it off.")
6970
endif()
7071

72+
# ICEBERG_S3 requires ICEBERG_BUILD_BUNDLE: the S3 sources are compiled as part
# of the bundle library. When the bundle is off, turn S3 off for this configure
# run and warn, because the user asked for a feature that will not be built.
# set() without CACHE only shadows the option; the user's cache entry is not
# rewritten.
if(ICEBERG_S3 AND NOT ICEBERG_BUILD_BUNDLE)
  set(ICEBERG_S3 OFF)
  message(WARNING "ICEBERG_S3 is disabled because ICEBERG_BUILD_BUNDLE is OFF")
endif()
77+
7178
include(CMakeParseArguments)
7279
include(IcebergBuildUtils)
7380
include(IcebergSanitizer)

ci/scripts/build_iceberg.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ CMAKE_ARGS=(
3636
"-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ICEBERG_HOME}}"
3737
"-DICEBERG_BUILD_STATIC=ON"
3838
"-DICEBERG_BUILD_SHARED=ON"
39+
"-DICEBERG_S3=ON"
3940
"-DICEBERG_BUILD_REST_INTEGRATION_TESTS=${build_rest_integration_test}"
4041
)
4142

ci/scripts/start_minio.sh

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
#!/usr/bin/env bash
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
20+
set -eux
21+
22+
# Runtime configuration. Each setting keeps a non-empty value inherited from
# the environment and otherwise falls back to the default given here.
: "${MINIO_ROOT_USER:=minio}"
: "${MINIO_ROOT_PASSWORD:=minio123}"
: "${MINIO_IMAGE:=minio/minio:latest}"
: "${MINIO_CONTAINER_NAME:=iceberg-minio}"
: "${MINIO_PORT:=9000}"
: "${MINIO_CONSOLE_PORT:=9001}"
: "${MINIO_BUCKET:=iceberg-test}"
# Derived from MINIO_PORT unless an endpoint is supplied explicitly.
: "${MINIO_ENDPOINT:=http://127.0.0.1:${MINIO_PORT}}"
30+
31+
# Poll the MinIO readiness endpoint until it answers or the retry budget is
# used up.
#
# Reads:   MINIO_ENDPOINT, MINIO_CONTAINER_NAME
# Returns: 0 as soon as "${MINIO_ENDPOINT}/minio/health/ready" responds
#          successfully; 1 after 30 failed probes, each followed by a
#          one-second pause.
wait_for_minio() {
  local probes_left=30
  while [ "${probes_left}" -gt 0 ]; do
    # -f maps HTTP error statuses to a non-zero exit, so they count as
    # "not ready yet" just like a refused connection.
    if curl -fsS "${MINIO_ENDPOINT}/minio/health/ready" >/dev/null; then
      return 0
    fi
    sleep 1
    probes_left=$((probes_left - 1))
  done

  {
    echo "MinIO did not become ready after 30 seconds."
    echo "Endpoint: ${MINIO_ENDPOINT}"
  } >&2

  # Best effort: dump the container log when Docker is installed. The lookup
  # may fail (for example when MinIO was started natively), hence "|| true".
  if command -v docker >/dev/null 2>&1; then
    docker logs "${MINIO_CONTAINER_NAME}" 2>&1 || true
  fi
  return 1
}
45+
46+
# Start MinIO in a Docker container and wait until it is ready.
#
# Reads:   MINIO_CONTAINER_NAME, MINIO_IMAGE, MINIO_PORT, MINIO_CONSOLE_PORT,
#          MINIO_ROOT_USER, MINIO_ROOT_PASSWORD
# Returns: 0 when the container is running and MinIO reports ready; non-zero
#          when Docker is not installed, the container cannot be started, or
#          MinIO never becomes ready.
start_minio_docker() {
  if ! command -v docker >/dev/null 2>&1; then
    return 1
  fi

  # Remove a stale container with the same name. -F -x compares the name as a
  # literal whole line, so regex metacharacters in it cannot mis-match.
  if docker ps -a --format '{{.Names}}' | grep -Fxq -- "${MINIO_CONTAINER_NAME}"; then
    docker rm -f "${MINIO_CONTAINER_NAME}"
  fi

  # Inside the container the API and the console listen on the fixed ports
  # 9000 and 9001; only the host side of each mapping is configurable. The
  # console address must therefore be ":9001" to match the "-p ...:9001"
  # mapping, otherwise a custom MINIO_CONSOLE_PORT would publish a port that
  # nothing listens on.
  #
  # "|| return 1" is explicit because bash suppresses errexit inside a function
  # whose status is being tested by the caller (`if ! ...` or `... || ...`).
  # Without it a failed `docker run` would still be followed by the full
  # readiness wait before the caller could fall back.
  docker run -d --name "${MINIO_CONTAINER_NAME}" \
    -p "${MINIO_PORT}:9000" -p "${MINIO_CONSOLE_PORT}:9001" \
    -e "MINIO_ROOT_USER=${MINIO_ROOT_USER}" \
    -e "MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}" \
    "${MINIO_IMAGE}" \
    server /data --console-address ":9001" || return 1

  wait_for_minio
}
64+
65+
# Start a native MinIO server on macOS using a Homebrew-installed binary.
#
# Reads:   MINIO_ROOT_USER, MINIO_ROOT_PASSWORD, MINIO_PORT, MINIO_CONSOLE_PORT
# Returns: 0 once MinIO reports ready; 1 when Homebrew is missing or the
#          server does not become ready in time.
start_minio_macos() {
  if ! command -v brew >/dev/null 2>&1; then
    echo "brew is required to start MinIO on macOS without Docker" >&2
    return 1
  fi

  brew install minio

  # Bind the API port explicitly: the readiness probe targets MINIO_ENDPOINT,
  # which by default is built from MINIO_PORT, so the server has to listen on
  # that port rather than on its built-in default.
  # The server runs in the background and data is kept under /tmp/minio.
  MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \
    minio server /tmp/minio \
    --address ":${MINIO_PORT}" \
    --console-address ":${MINIO_CONSOLE_PORT}" &

  wait_for_minio
}
76+
77+
# Download the MinIO client (`mc`) that matches the current platform.
#
# Reads:   RUNNER_TEMP (download directory; falls back to /tmp)
# Sets:    MC_BIN (global) - path of the downloaded client
# Returns: 0 on success; 1 for an unsupported operating system; curl's
#          non-zero status when the download fails.
download_mc() {
  local uname_out
  uname_out="$(uname -s)"

  local machine
  machine="$(uname -m)"

  # Only arm64 hosts get a dedicated build; every other machine type keeps
  # using the amd64 client.
  local cpu="amd64"
  case "${machine}" in
    arm64 | aarch64) cpu="arm64" ;;
  esac

  local mc_dir
  mc_dir="${RUNNER_TEMP:-/tmp}"
  mkdir -p "${mc_dir}"

  local release_path
  case "${uname_out}" in
    Linux*)
      MC_BIN="${mc_dir}/mc"
      release_path="linux-${cpu}/mc"
      ;;
    Darwin*)
      MC_BIN="${mc_dir}/mc"
      release_path="darwin-${cpu}/mc"
      ;;
    MINGW* | MSYS* | CYGWIN*)
      MC_BIN="${mc_dir}/mc.exe"
      release_path="windows-amd64/mc.exe"
      ;;
    *)
      echo "Unsupported OS for mc: ${uname_out}" >&2
      return 1
      ;;
  esac

  # -f makes curl fail on an HTTP error instead of saving the error page as
  # the "binary", which would only surface later as a confusing exec failure.
  curl -fsSL "https://dl.min.io/client/mc/release/${release_path}" -o "${MC_BIN}" || return 1

  # Windows executables need no mode change.
  case "${MC_BIN}" in
    *.exe) ;;
    *) chmod +x "${MC_BIN}" ;;
  esac
}
112+
113+
# Create the test bucket on the running MinIO server.
#
# Downloads the `mc` client, registers the server under the alias "local"
# (retrying up to 30 times, one second apart) and creates MINIO_BUCKET if it
# does not exist yet.
#
# Reads:   MINIO_ENDPOINT, MINIO_ROOT_USER, MINIO_ROOT_PASSWORD, MINIO_BUCKET
# Returns: 0 when the bucket exists afterwards; non-zero when the alias could
#          not be configured or the bucket could not be created.
create_bucket() {
  download_mc

  local alias_ready=0
  for _ in {1..30}; do
    if "${MC_BIN}" alias set local "${MINIO_ENDPOINT}" "${MINIO_ROOT_USER}" "${MINIO_ROOT_PASSWORD}"; then
      alias_ready=1
      break
    fi
    sleep 1
  done

  # Fail with a precise message instead of letting `mb` run against an alias
  # that was never configured.
  if [ "${alias_ready}" -ne 1 ]; then
    echo "Could not configure mc alias for ${MINIO_ENDPOINT} after 30 attempts." >&2
    return 1
  fi

  "${MC_BIN}" mb --ignore-existing "local/${MINIO_BUCKET}"
}
123+
124+
# Download and start a native MinIO server on Windows.
#
# Reads:   RUNNER_TEMP (download and data directory; falls back to /tmp),
#          MINIO_ROOT_USER, MINIO_ROOT_PASSWORD, MINIO_PORT, MINIO_CONSOLE_PORT
# Returns: 0 once MinIO reports ready; non-zero when the download fails or the
#          server does not become ready in time.
start_minio_windows() {
  local minio_dir="${RUNNER_TEMP:-/tmp}"
  local minio_bin="${minio_dir}/minio.exe"

  # -f makes curl fail on an HTTP error instead of saving the error page as
  # minio.exe.
  curl -fsSL "https://dl.min.io/server/minio/release/windows-amd64/minio.exe" -o "${minio_bin}"

  # Bind the API port explicitly: the readiness probe targets MINIO_ENDPOINT,
  # which by default is built from MINIO_PORT, so the server has to listen on
  # that port rather than on its built-in default.
  MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \
    "${minio_bin}" server "${minio_dir}/minio-data" \
    --address ":${MINIO_PORT}" \
    --console-address ":${MINIO_CONSOLE_PORT}" &

  wait_for_minio
}
132+
133+
# Entry point: start MinIO with the strategy that fits the host operating
# system, then make sure the test bucket exists.
case "$(uname -s)" in
  Linux*)
    # Docker is the only start-up path on Linux; there is no native fallback.
    start_minio_docker
    ;;
  Darwin*)
    # Prefer Docker and fall back to a Homebrew-installed server.
    start_minio_docker || start_minio_macos
    ;;
  MINGW* | MSYS* | CYGWIN*)
    # Prefer Docker and fall back to a downloaded minio.exe.
    start_minio_docker || start_minio_windows
    ;;
  *)
    echo "Unsupported OS: $(uname -s)" >&2
    exit 1
    ;;
esac

create_bucket

cmake_modules/IcebergThirdpartyToolchain.cmake

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ function(resolve_arrow_dependency)
102102
# Work around undefined symbol: arrow::ipc::ReadSchema(arrow::io::InputStream*, arrow::ipc::DictionaryMemo*)
103103
set(ARROW_IPC ON)
104104
set(ARROW_FILESYSTEM ON)
105+
set(ARROW_S3 ${ICEBERG_S3})
105106
set(ARROW_JSON ON)
106107
set(ARROW_PARQUET ON)
107108
set(ARROW_SIMD_LEVEL "NONE")
@@ -164,6 +165,13 @@ function(resolve_arrow_dependency)
164165
install(FILES ${arrow_bundled_dependencies_location}
165166
DESTINATION ${ICEBERG_INSTALL_LIBDIR})
166167
endif()
168+
169+
# Arrow's exported static target interface may reference system libraries
170+
# (e.g. OpenSSL, CURL, ZLIB) that consumers need to find.
171+
list(APPEND ICEBERG_SYSTEM_DEPENDENCIES ZLIB)
172+
if(ARROW_S3)
173+
list(APPEND ICEBERG_SYSTEM_DEPENDENCIES OpenSSL CURL)
174+
endif()
167175
else()
168176
set(ARROW_VENDORED FALSE)
169177
find_package(Arrow CONFIG REQUIRED)

src/iceberg/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ set(ICEBERG_SOURCES
4343
expression/rewrite_not.cc
4444
expression/strict_metrics_evaluator.cc
4545
expression/term.cc
46+
file_io_registry.cc
4647
file_reader.cc
4748
file_writer.cc
4849
inheritable_metadata.cc
@@ -181,6 +182,8 @@ add_subdirectory(util)
181182
if(ICEBERG_BUILD_BUNDLE)
182183
set(ICEBERG_BUNDLE_SOURCES
183184
arrow/arrow_fs_file_io.cc
185+
arrow/s3/arrow_s3_file_io.cc
186+
arrow/file_io_register.cc
184187
arrow/metadata_column_util.cc
185188
avro/avro_data_util.cc
186189
avro/avro_direct_decoder.cc
@@ -247,6 +250,18 @@ if(ICEBERG_BUILD_BUNDLE)
247250
OUTPUTS
248251
ICEBERG_BUNDLE_LIBRARIES)
249252

253+
# Tell the bundle libraries whether S3 support is compiled in.
# ICEBERG_S3_ENABLED is always defined, as 1 when ICEBERG_S3 is true and 0
# otherwise. The BUILD_INTERFACE wrapper limits the PUBLIC definition to
# consumers inside this build tree; it is not part of the installed interface.
foreach(bundle_target IN ITEMS iceberg_bundle_static iceberg_bundle_shared)
  # Either variant may be absent depending on which library types are built.
  if(NOT TARGET ${bundle_target})
    continue()
  endif()
  target_compile_definitions(${bundle_target}
                             PUBLIC "$<BUILD_INTERFACE:ICEBERG_S3_ENABLED=$<BOOL:${ICEBERG_S3}>>"
  )
endforeach()
264+
250265
add_subdirectory(arrow)
251266
add_subdirectory(avro)
252267
add_subdirectory(parquet)

src/iceberg/arrow/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,5 @@
1616
# under the License.
1717

1818
iceberg_install_all_headers(iceberg/arrow)
19+
20+
add_subdirectory(s3)

src/iceberg/arrow/arrow_file_io.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,34 @@
2020
#pragma once
2121

2222
#include <memory>
23+
#include <string>
24+
#include <unordered_map>
2325

2426
#include "iceberg/file_io.h"
2527
#include "iceberg/iceberg_bundle_export.h"
28+
#include "iceberg/result.h"
2629

2730
namespace iceberg::arrow {
2831

2932
ICEBERG_BUNDLE_EXPORT std::unique_ptr<FileIO> MakeMockFileIO();
3033

3134
ICEBERG_BUNDLE_EXPORT std::unique_ptr<FileIO> MakeLocalFileIO();
3235

36+
/// \brief Create an S3 FileIO backed by Arrow's S3FileSystem.
37+
///
38+
/// This function initializes the S3 subsystem if not already initialized (thread-safe).
39+
/// The S3 initialization is cached once per process.
40+
///
41+
/// \param properties Configuration properties for S3 access. See S3Properties
42+
/// for available keys (credentials, region, endpoint, timeouts, etc.).
43+
/// \return A FileIO instance for S3 operations, or an error if S3 is not supported.
44+
ICEBERG_BUNDLE_EXPORT Result<std::unique_ptr<FileIO>> MakeS3FileIO(
45+
const std::unordered_map<std::string, std::string>& properties = {});
46+
47+
/// \brief Finalize (clean up) the Arrow S3 subsystem.
48+
///
49+
/// Must be called before process exit if S3 was initialized, otherwise Arrow's
50+
/// static destructors may cause a non-zero exit.
51+
ICEBERG_BUNDLE_EXPORT Status FinalizeS3();
52+
3353
} // namespace iceberg::arrow

0 commit comments

Comments
 (0)