Skip to content

Commit 5197fa9

Browse files
feat: integration s3 with arrow filesystem
1 parent e7f1d0f commit 5197fa9

File tree

11 files changed

+787
-0
lines changed

11 files changed

+787
-0
lines changed

.github/workflows/test.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,22 @@ jobs:
4343
timeout-minutes: 30
4444
strategy:
4545
fail-fast: false
46+
env:
47+
ICEBERG_TEST_S3_URI: s3://iceberg-test
48+
AWS_ACCESS_KEY_ID: minio
49+
AWS_SECRET_ACCESS_KEY: minio123
50+
AWS_DEFAULT_REGION: us-east-1
51+
AWS_ENDPOINT_URL: http://127.0.0.1:9000
52+
AWS_EC2_METADATA_DISABLED: "TRUE"
4653
steps:
4754
- name: Checkout iceberg-cpp
4855
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
4956
- name: Install dependencies
5057
shell: bash
5158
run: sudo apt-get update && sudo apt-get install -y libcurl4-openssl-dev
59+
- name: Start MinIO
60+
shell: bash
61+
run: bash ci/scripts/start_minio.sh
5262
- name: Build Iceberg
5363
shell: bash
5464
env:
@@ -67,9 +77,19 @@ jobs:
6777
timeout-minutes: 30
6878
strategy:
6979
fail-fast: false
80+
env:
81+
ICEBERG_TEST_S3_URI: s3://iceberg-test
82+
AWS_ACCESS_KEY_ID: minio
83+
AWS_SECRET_ACCESS_KEY: minio123
84+
AWS_DEFAULT_REGION: us-east-1
85+
AWS_ENDPOINT_URL: http://127.0.0.1:9000
86+
AWS_EC2_METADATA_DISABLED: "TRUE"
7087
steps:
7188
- name: Checkout iceberg-cpp
7289
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
90+
- name: Start MinIO
91+
shell: bash
92+
run: bash ci/scripts/start_minio.sh
7393
- name: Build Iceberg
7494
shell: bash
7595
run: ci/scripts/build_iceberg.sh $(pwd)
@@ -82,6 +102,13 @@ jobs:
82102
timeout-minutes: 60
83103
strategy:
84104
fail-fast: false
105+
env:
106+
ICEBERG_TEST_S3_URI: s3://iceberg-test
107+
AWS_ACCESS_KEY_ID: minio
108+
AWS_SECRET_ACCESS_KEY: minio123
109+
AWS_DEFAULT_REGION: us-east-1
110+
AWS_ENDPOINT_URL: http://127.0.0.1:9000
111+
AWS_EC2_METADATA_DISABLED: "TRUE"
85112
steps:
86113
- name: Checkout iceberg-cpp
87114
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -91,6 +118,9 @@ jobs:
91118
vcpkg install zlib:x64-windows nlohmann-json:x64-windows nanoarrow:x64-windows roaring:x64-windows cpr:x64-windows
92119
- name: Setup sccache
93120
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9
121+
- name: Start MinIO
122+
shell: bash
123+
run: bash ci/scripts/start_minio.sh
94124
- name: Build Iceberg
95125
shell: cmd
96126
env:

ci/scripts/start_minio.sh

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
#!/usr/bin/env bash
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
20+
set -eux
21+
22+
# Configuration knobs. Each one keeps a value supplied by the caller's
# environment and otherwise falls back to the default shown here.

# Root credentials used for the MinIO server and for the mc alias.
: "${MINIO_ROOT_USER:=minio}"
: "${MINIO_ROOT_PASSWORD:=minio123}"

# Container image and container name used by the Docker code path.
# NOTE(review): confirm this tag exists in the registry; the all-zero
# time component (T00-00-00Z) looks like a placeholder.
: "${MINIO_IMAGE:=minio/minio:RELEASE.2024-12-18T00-00-00Z}"
: "${MINIO_CONTAINER_NAME:=iceberg-minio}"

# Host ports for the S3 API and the web console.
: "${MINIO_PORT:=9000}"
: "${MINIO_CONSOLE_PORT:=9001}"

# Bucket created once the server is up, and the URL used to reach the server.
: "${MINIO_BUCKET:=iceberg-test}"
: "${MINIO_ENDPOINT:=http://127.0.0.1:${MINIO_PORT}}"
30+
31+
# Poll the MinIO readiness endpoint until it answers or the retry budget is
# exhausted.
#
# Globals:
#   MINIO_ENDPOINT  base URL of the server being probed
# Returns:
#   0 as soon as ${MINIO_ENDPOINT}/minio/health/ready responds successfully,
#   1 if it never does within 30 attempts spaced one second apart.
wait_for_minio() {
  local attempt
  for attempt in {1..30}; do
    # --max-time bounds every probe so that a connection which is accepted but
    # never answered cannot stall the whole wait loop.
    if curl -fsS --max-time 5 "${MINIO_ENDPOINT}/minio/health/ready" >/dev/null; then
      return 0
    fi
    sleep 1
  done

  # Leave a clear diagnostic instead of exiting silently under `set -e`.
  echo "MinIO did not become ready at ${MINIO_ENDPOINT}" >&2
  return 1
}
40+
41+
# Start MinIO in a Docker container and wait until it is ready.
#
# Globals:
#   MINIO_CONTAINER_NAME, MINIO_IMAGE, MINIO_PORT, MINIO_CONSOLE_PORT,
#   MINIO_ROOT_USER, MINIO_ROOT_PASSWORD
# Returns:
#   0 when the container is running and the readiness probe succeeded,
#   1 when docker is unavailable, the container could not be started, or the
#   server never became ready.
start_minio_docker() {
  if ! command -v docker >/dev/null 2>&1; then
    return 1
  fi

  # Remove a stale container left over from a previous run. -F/-x match the
  # name literally and as a whole line, so regex metacharacters in the name
  # cannot cause a false match.
  if docker ps -a --format '{{.Names}}' | grep -Fxq "${MINIO_CONTAINER_NAME}"; then
    docker rm -f "${MINIO_CONTAINER_NAME}"
  fi

  # Inside the container MinIO always listens on 9000 (API) and 9001 (console);
  # only the host side of each mapping is configurable. Passing
  # MINIO_CONSOLE_PORT to --console-address would make the console listen on a
  # port that the "-p ...:9001" mapping does not forward whenever the variable
  # is overridden.
  #
  # The explicit "|| return 1" matters when this function is called from an
  # `if !` condition: errexit is suppressed there, and without it a failed
  # `docker run` would still fall through to the 30-second readiness wait.
  docker run -d --name "${MINIO_CONTAINER_NAME}" \
    -p "${MINIO_PORT}:9000" -p "${MINIO_CONSOLE_PORT}:9001" \
    -e "MINIO_ROOT_USER=${MINIO_ROOT_USER}" \
    -e "MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}" \
    "${MINIO_IMAGE}" \
    server /data --console-address ":9001" || return 1

  wait_for_minio
}
59+
60+
# Start a native MinIO server on macOS (installed through Homebrew) in the
# background and wait until it is ready. Used when Docker is not available.
#
# Globals:
#   MINIO_ROOT_USER, MINIO_ROOT_PASSWORD, MINIO_PORT, MINIO_CONSOLE_PORT
# Returns:
#   0 when the server answered the readiness probe, 1 when brew is missing or
#   the server never became ready.
start_minio_macos() {
  if ! command -v brew >/dev/null 2>&1; then
    echo "brew is required to start MinIO on macOS without Docker" >&2
    return 1
  fi

  brew install minio

  # The credentials are plain shell variables in this script and are not
  # exported, so they must be handed to the server process explicitly;
  # otherwise the server would not be started with the credentials that
  # create_bucket later uses for `mc alias set`.
  # --address keeps the API port in sync with MINIO_PORT / MINIO_ENDPOINT.
  MINIO_ROOT_USER="${MINIO_ROOT_USER}" \
  MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \
    minio server /tmp/minio \
      --address ":${MINIO_PORT}" \
      --console-address ":${MINIO_CONSOLE_PORT}" &

  wait_for_minio
}
70+
71+
# Download the MinIO client (mc) that matches the host OS and CPU architecture.
#
# Globals:
#   RUNNER_TEMP  (read)  download directory; falls back to /tmp when unset
#   MC_BIN       (write) path of the downloaded client, used by create_bucket
# Returns:
#   0 on success, 1 for an unsupported OS; a failed download returns curl's
#   non-zero status.
download_mc() {
  local os_name machine arch mc_dir platform binary_name
  os_name="$(uname -s)"
  machine="$(uname -m)"

  # Map the kernel's machine name onto the architecture names used in the
  # download URLs. Unknown machines keep the historical amd64 default.
  case "${machine}" in
    arm64|aarch64)
      arch="arm64"
      ;;
    *)
      arch="amd64"
      ;;
  esac

  case "${os_name}" in
    Linux*)
      platform="linux-${arch}"
      binary_name="mc"
      ;;
    Darwin*)
      platform="darwin-${arch}"
      binary_name="mc"
      ;;
    MINGW*|MSYS*|CYGWIN*)
      # The Windows download stays on the amd64 build, as before.
      platform="windows-amd64"
      binary_name="mc.exe"
      ;;
    *)
      echo "Unsupported OS for mc: ${os_name}" >&2
      return 1
      ;;
  esac

  mc_dir="${RUNNER_TEMP:-/tmp}"
  mkdir -p "${mc_dir}"
  MC_BIN="${mc_dir}/${binary_name}"

  # -f makes curl fail on an HTTP error instead of saving the error page as
  # the "binary", which would then be marked executable and fail obscurely.
  curl -fsSL "https://dl.min.io/client/mc/release/${platform}/${binary_name}" \
    -o "${MC_BIN}" || return 1
  chmod +x "${MC_BIN}"
}
100+
101+
# Download mc, register the MinIO server under the alias "local", and create
# the test bucket if it does not exist yet.
#
# Globals:
#   MC_BIN (set by download_mc), MINIO_ENDPOINT, MINIO_ROOT_USER,
#   MINIO_ROOT_PASSWORD, MINIO_BUCKET
# Returns:
#   0 when the bucket exists afterwards, non-zero when mc could not be
#   downloaded, the alias could not be configured within 30 attempts, or the
#   bucket could not be created.
create_bucket() {
  download_mc || return 1

  local attempt
  local alias_ready=0
  for attempt in {1..30}; do
    if "${MC_BIN}" alias set local "${MINIO_ENDPOINT}" "${MINIO_ROOT_USER}" "${MINIO_ROOT_PASSWORD}"; then
      alias_ready=1
      break
    fi
    sleep 1
  done

  # Fail here with a clear message rather than letting `mc mb` fail later on
  # an alias that was never configured.
  if [ "${alias_ready}" -ne 1 ]; then
    echo "Failed to configure mc alias for ${MINIO_ENDPOINT}" >&2
    return 1
  fi

  "${MC_BIN}" mb --ignore-existing "local/${MINIO_BUCKET}"
}
111+
112+
# Entry point: start MinIO with the strategy that fits the host OS, then
# create the test bucket.
host_os="$(uname -s)"

case "${host_os}" in
  Linux*|MINGW*|MSYS*|CYGWIN*)
    # These platforms use the Docker code path only.
    start_minio_docker
    ;;
  Darwin*)
    # Try Docker first; fall back to a Homebrew-installed server.
    start_minio_docker || start_minio_macos
    ;;
  *)
    echo "Unsupported OS: ${host_os}" >&2
    exit 1
    ;;
esac

create_bucket

cmake_modules/IcebergThirdpartyToolchain.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ function(resolve_arrow_dependency)
8787
# Work around undefined symbol: arrow::ipc::ReadSchema(arrow::io::InputStream*, arrow::ipc::DictionaryMemo*)
8888
set(ARROW_IPC ON)
8989
set(ARROW_FILESYSTEM ON)
90+
set(ARROW_S3 ON)
9091
set(ARROW_JSON ON)
9192
set(ARROW_PARQUET ON)
9293
set(ARROW_SIMD_LEVEL "NONE")

src/iceberg/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ set(ICEBERG_SOURCES
3939
expression/rewrite_not.cc
4040
expression/strict_metrics_evaluator.cc
4141
expression/term.cc
42+
file_io_registry.cc
4243
file_reader.cc
4344
file_writer.cc
4445
inheritable_metadata.cc
@@ -171,6 +172,8 @@ add_subdirectory(util)
171172
if(ICEBERG_BUILD_BUNDLE)
172173
set(ICEBERG_BUNDLE_SOURCES
173174
arrow/arrow_fs_file_io.cc
175+
arrow/arrow_s3_file_io.cc
176+
arrow/file_io_register.cc
174177
arrow/metadata_column_util.cc
175178
avro/avro_data_util.cc
176179
avro/avro_direct_decoder.cc

src/iceberg/arrow/arrow_file_io.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,31 @@
2020
#pragma once
2121

2222
#include <memory>
23+
#include <string>
24+
#include <unordered_map>
2325

2426
#include "iceberg/file_io.h"
2527
#include "iceberg/iceberg_bundle_export.h"
28+
#include "iceberg/result.h"
2629

2730
namespace iceberg::arrow {
2831

2932
ICEBERG_BUNDLE_EXPORT std::unique_ptr<FileIO> MakeMockFileIO();
3033

3134
ICEBERG_BUNDLE_EXPORT std::unique_ptr<FileIO> MakeLocalFileIO();
3235

36+
/// \brief Create an S3 FileIO backed by Arrow's S3FileSystem.
37+
///
38+
/// This function initializes the S3 subsystem if not already initialized (thread-safe).
39+
/// The S3 initialization is done once per process using std::call_once.
40+
///
41+
/// \param uri An S3 URI (must start with "s3://") used to validate the scheme.
42+
/// \param properties Optional configuration properties for S3 access. See S3Properties
43+
/// for available keys (credentials, region, endpoint, timeouts, etc.).
44+
/// \return A FileIO instance for S3 operations, or an error if S3 is not supported
45+
/// or the URI is invalid.
46+
ICEBERG_BUNDLE_EXPORT Result<std::unique_ptr<FileIO>> MakeS3FileIO(
47+
const std::string& uri,
48+
const std::unordered_map<std::string, std::string>& properties = {});
49+
3350
} // namespace iceberg::arrow

0 commit comments

Comments
 (0)