diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake index 277459364..8f5b9c683 100644 --- a/cmake_modules/IcebergThirdpartyToolchain.cmake +++ b/cmake_modules/IcebergThirdpartyToolchain.cmake @@ -260,7 +260,10 @@ function(resolve_nanoarrow_dependency) set(NANOARROW_URL "$ENV{ICEBERG_NANOARROW_URL}") else() set(NANOARROW_URL + "https://www.apache.org/dyn/closer.lua?action=download&filename=/arrow/apache-arrow-nanoarrow-0.7.0/apache-arrow-nanoarrow-0.7.0.tar.gz" "https://dlcdn.apache.org/arrow/apache-arrow-nanoarrow-0.7.0/apache-arrow-nanoarrow-0.7.0.tar.gz" + "https://archive.apache.org/dist/arrow/apache-arrow-nanoarrow-0.7.0/apache-arrow-nanoarrow-0.7.0.tar.gz" + "https://github.com/apache/arrow-nanoarrow/releases/download/apache-arrow-nanoarrow-0.7.0/apache-arrow-nanoarrow-0.7.0.tar.gz" ) endif() diff --git a/mkdocs/docs/contributing.md b/mkdocs/docs/contributing.md new file mode 100644 index 000000000..98045c45e --- /dev/null +++ b/mkdocs/docs/contributing.md @@ -0,0 +1,190 @@ + + +# Contributing Guide + +We welcome contributions to Apache Iceberg! To learn more about contributing to Apache Iceberg, please refer to the official Iceberg contribution guidelines. These guidelines are intended as helpful suggestions to make the contribution process as seamless as possible, and are not strict rules. + +If you would like to discuss your proposed change before contributing, we encourage you to visit our Community page. There, you will find various ways to connect with the community, including Slack and our mailing lists. Alternatively, you can open a new issue directly in the GitHub repository. + +For first-time contributors, feel free to check out our good first issues for an easy way to get started. + +## Contributing to Iceberg C++ + +The Iceberg C++ Project is hosted on GitHub at [https://github.com/apache/iceberg-cpp](https://github.com/apache/iceberg-cpp). 
+ +### Development Setup + +#### Prerequisites + +- CMake 3.25 or higher +- C++23 compliant compiler (GCC 14+, Clang 16+, MSVC 2022+) +- Git + +#### Building from Source + +Clone the repository for local development: + +```bash +git clone https://github.com/apache/iceberg-cpp.git +cd iceberg-cpp +``` + +Build the core libraries: + +```bash +cmake -S . -B build -G Ninja -DCMAKE_INSTALL_PREFIX=/path/to/install -DICEBERG_BUILD_STATIC=ON -DICEBERG_BUILD_SHARED=ON +cmake --build build +ctest --test-dir build --output-on-failure +cmake --install build +``` + +Build with bundled dependencies: + +```bash +cmake -S . -B build -G Ninja -DCMAKE_INSTALL_PREFIX=/path/to/install -DICEBERG_BUILD_BUNDLE=ON +cmake --build build +ctest --test-dir build --output-on-failure +cmake --install build +``` + +### Code Standards + +#### C++ Coding Standards + +We follow modern C++ best practices: + +- **C++23 Standard**: Use C++23 features where appropriate +- **Naming Conventions**: + - Classes: `PascalCase` (e.g., `TableScanBuilder`) + - Functions/Methods: `PascalCase` (e.g., `CreateNamespace`, `ExtractYear`) + - Trivial getters: `snake_case` (e.g., `name()`, `type_id()`, `is_primitive()`) + - Variables: `snake_case` (e.g., `file_io`) + - Constants: `k` prefix with `PascalCase` (e.g., `kHeaderContentType`, `kMaxPrecision`) +- **Memory Management**: Prefer smart pointers (`std::unique_ptr`, `std::shared_ptr`) +- **Error Handling**: Use `Result` types for error propagation +- **Documentation**: Use Doxygen-style comments for public APIs + +#### API Compatibility + +It is important to keep the C++ public API compatible across versions. Public methods should have no leading underscores and should not be removed without deprecation notice. + +If you want to remove a method, please add a deprecation notice: + +```cpp +[[deprecated("This method will be removed in version 2.0.0. 
Use new_method() instead.")]] +void old_method(); +``` + +#### Code Formatting + +We use `clang-format` for code formatting. The configuration is defined in the `.clang-format` file. + +Format your code before submitting: + +```bash +clang-format -i src/**/*.{h,cc} +``` + +### Testing + +#### Running Tests + +Run all tests: + +```bash +ctest --test-dir build --output-on-failure +``` + +Run a specific test: + +```bash +ctest --test-dir build -R test_name +``` + +### Linting + +Install the Python package `pre-commit` and run `pre-commit install` once: + +```bash +pip install pre-commit +pre-commit install +``` + +This will set up a Git pre-commit hook that is executed on each commit and will report the linting problems. To run all hooks on all files, use `pre-commit run -a`. + +### Submitting Changes + +#### Git Workflow + +1. **Fork the repository** on GitHub +2. **Create a feature branch** from `main`: + ```bash + git checkout -b feature/your-feature-name + ``` +3. **Make your changes** following the coding standards +4. **Add tests** for your changes +5. **Run tests** to ensure everything passes +6. **Commit your changes** with a clear commit message +7. **Push to your fork** and create a Pull Request + +#### Commit Message Format + +Use clear, descriptive commit messages: + +``` +feat: add support for S3 file system +fix: resolve memory leak in table reader +docs: update API documentation +test: add unit tests for schema validation +``` + +#### Pull Request Process + +1. **Create a Pull Request** with a clear description +2. **Link related issues** if applicable +3. **Ensure CI passes** - all tests must pass +4. **Request review** from maintainers +5. **Address feedback** and update the PR as needed +6. 
**Squash commits** if requested by reviewers + +### Community + +The Apache Iceberg community is built on the principles described in the [Apache Way](https://www.apache.org/theapacheway/index.html), and all who engage with the community are expected to be respectful and open, to come with the best interests of the community in mind, and to abide by the Apache Software Foundation [Code of Conduct](https://www.apache.org/foundation/policies/conduct.html). + +#### Getting Help + +- **Submit Issues**: [GitHub Issues](https://github.com/apache/iceberg-cpp/issues/new) for bug reports or feature requests +- **Mailing List**: [dev@iceberg.apache.org](mailto:dev@iceberg.apache.org) for discussions + - [Subscribe](mailto:dev-subscribe@iceberg.apache.org?subject=(send%20this%20email%20to%20subscribe)) + - [Unsubscribe](mailto:dev-unsubscribe@iceberg.apache.org?subject=(send%20this%20email%20to%20unsubscribe)) + - [Archives](https://lists.apache.org/list.html?dev@iceberg.apache.org) +- **Slack**: [Apache Iceberg Slack #cpp channel](https://join.slack.com/t/apache-iceberg/shared_invite/zt-1zbov3k6e-KtJfoaxp97YfX6dPz1Bk7A) + +#### Good First Issues + +New to the project? Check out our [good first issues](https://github.com/apache/iceberg-cpp/labels/good%20first%20issue) for an easy way to get started. + +### Release Process + +Releases are managed by the Apache Iceberg project maintainers. For information about the release process, please refer to the main Iceberg project documentation. 
+ +## License + +Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) diff --git a/mkdocs/docs/getting-started.md b/mkdocs/docs/getting-started.md new file mode 100644 index 000000000..682b409a3 --- /dev/null +++ b/mkdocs/docs/getting-started.md @@ -0,0 +1,94 @@ + + +# Getting Started + +## Requirements + +- CMake 3.25 or higher +- C++23 compliant compiler + +## Customizing Dependency URLs + +If you experience network issues when downloading dependencies, you can customize the download URLs using environment variables. + +The following environment variables can be set to customize dependency URLs: + +- `ICEBERG_ARROW_URL`: Apache Arrow tarball URL +- `ICEBERG_AVRO_URL`: Apache Avro tarball URL +- `ICEBERG_AVRO_GIT_URL`: Apache Avro git repository URL +- `ICEBERG_NANOARROW_URL`: Nanoarrow tarball URL +- `ICEBERG_CROARING_URL`: CRoaring tarball URL +- `ICEBERG_NLOHMANN_JSON_URL`: nlohmann-json tarball URL +- `ICEBERG_CPR_URL`: cpr tarball URL + +Example usage: + +```bash +export ICEBERG_ARROW_URL="https://your-mirror.com/apache-arrow-22.0.0.tar.gz" +cmake -S . -B build +``` + +## Build + +### Build, Run Test and Install Core Libraries + +```bash +cd iceberg-cpp +cmake -S . -B build -G Ninja -DCMAKE_INSTALL_PREFIX=/path/to/install -DICEBERG_BUILD_STATIC=ON -DICEBERG_BUILD_SHARED=ON +cmake --build build +ctest --test-dir build --output-on-failure +cmake --install build +``` + +### Build and Install Iceberg Bundle Library + +#### Vendored Apache Arrow (default) + +```bash +cmake -S . -B build -G Ninja -DCMAKE_INSTALL_PREFIX=/path/to/install -DICEBERG_BUILD_BUNDLE=ON +cmake --build build +ctest --test-dir build --output-on-failure +cmake --install build +``` + +#### Provided Apache Arrow + +```bash +cmake -S . 
-B build -G Ninja -DCMAKE_INSTALL_PREFIX=/path/to/install -DCMAKE_PREFIX_PATH=/path/to/arrow -DICEBERG_BUILD_BUNDLE=ON +cmake --build build +ctest --test-dir build --output-on-failure +cmake --install build +``` + +### Build Examples + +After installing the core libraries, you can build the examples: + +```bash +cd iceberg-cpp/example +cmake -S . -B build -G Ninja -DCMAKE_PREFIX_PATH=/path/to/install +cmake --build build +``` + +If you are using provided Apache Arrow, you need to include `/path/to/arrow` in `CMAKE_PREFIX_PATH` as below. + +```bash +cmake -S . -B build -G Ninja -DCMAKE_PREFIX_PATH="/path/to/install;/path/to/arrow" +``` diff --git a/mkdocs/docs/index.md b/mkdocs/docs/index.md index 2e17cf28d..bee61e95b 100644 --- a/mkdocs/docs/index.md +++ b/mkdocs/docs/index.md @@ -17,174 +17,30 @@ ~ under the License. --> -# Contributing +# Apache Iceberg™ C++ -We welcome contributions to Apache Iceberg! To learn more about contributing to Apache Iceberg, please refer to the official Iceberg contribution guidelines. These guidelines are intended as helpful suggestions to make the contribution process as seamless as possible, and are not strict rules. +Apache Iceberg C++ is a high-performance C++ implementation of the [Apache Iceberg™](https://iceberg.apache.org/) open table format. -If you would like to discuss your proposed change before contributing, we encourage you to visit our Community page. There, you will find various ways to connect with the community, including Slack and our mailing lists. Alternatively, you can open a new issue directly in the GitHub repository. +## Key Features -For first-time contributors, feel free to check out our good first issues for an easy way to get started. +* **Native C++ Implementation**: Built with modern C++23 standards for performance and reliability. +* **Arrow Integration**: Seamless integration with Apache Arrow for efficient data processing and interchange. 
+* **Avro Support**: Full support for reading and writing Avro files. +* **Schema Evolution**: Full support for schema evolution (add, drop, rename, update columns). +* **Partitioning**: Support for hidden partitioning and partition evolution. +* **Cross-Platform**: Support for Linux, macOS, and Windows. -## Contributing to Iceberg C++ +## Quick Links -The Iceberg C++ Project is hosted on GitHub at [https://github.com/apache/iceberg-cpp](https://github.com/apache/iceberg-cpp). +* **[Getting Started](getting-started.md)**: Learn how to build and install the library. +* **[Releases](releases.md)**: Check out the latest releases and changelogs. +* **[API Documentation](api/index.html)**: Explore the C++ API reference. +* **[Contributing](contributing.md)**: Find out how to contribute to the project. -### Development Setup +## Community -#### Prerequisites +Apache Iceberg is an active open-source project. We welcome contributions and feedback! -- CMake 3.25 or higher -- C++23 compliant compiler (GCC 14+, Clang 16+, MSVC 2022+) -- Git - -#### Building from Source - -Clone the repository for local development: - -```bash -git clone https://github.com/apache/iceberg-cpp.git -cd iceberg-cpp -``` - -Build the core libraries: - -```bash -cmake -S . -B build -G Ninja -DCMAKE_INSTALL_PREFIX=/path/to/install -DICEBERG_BUILD_STATIC=ON -DICEBERG_BUILD_SHARED=ON -cmake --build build -ctest --test-dir build --output-on-failure -cmake --install build -``` - -Build with bundled dependencies: - -```bash -cmake -S . 
-B build -G Ninja -DCMAKE_INSTALL_PREFIX=/path/to/install -DICEBERG_BUILD_BUNDLE=ON -cmake --build build -ctest --test-dir build --output-on-failure -cmake --install build -``` - -### Code Standards - -#### C++ Coding Standards - -We follow modern C++ best practices: - -- **C++23 Standard**: Use C++23 features where appropriate -- **Naming Conventions**: - - Classes: `PascalCase` (e.g., `TableScanBuilder`) - - Functions/Methods: `PascalCase` (e.g., `CreateNamespace`, `ExtractYear`) - - Trivial getters: `snake_case` (e.g., `name()`, `type_id()`, `is_primitive()`) - - Variables: `snake_case` (e.g., `file_io`) - - Constants: `k` prefix with `PascalCase` (e.g., `kHeaderContentType`, `kMaxPrecision`) -- **Memory Management**: Prefer smart pointers (`std::unique_ptr`, `std::shared_ptr`) -- **Error Handling**: Use `Result` types for error propagation -- **Documentation**: Use Doxygen-style comments for public APIs - -#### API Compatibility - -It is important to keep the C++ public API compatible across versions. Public methods should have no leading underscores and should not be removed without deprecation notice. - -If you want to remove a method, please add a deprecation notice: - -```cpp -[[deprecated("This method will be removed in version 2.0.0. Use new_method() instead.")]] -void old_method(); -``` - -#### Code Formatting - -We use `clang-format` for code formatting. The configuration is defined in `.clang-format` file. - -Format your code before submitting: - -```bash -clang-format -i src/**/*.{h,cc} -``` - -### Testing - -#### Running Tests - -Run all tests: - -```bash -ctest --test-dir build --output-on-failure -``` - -Run specific test: - -```bash -ctest --test-dir build -R test_name -``` - -### Linting - -Install the python package `pre-commit` and run once `pre-commit install`: - -```bash -pip install pre-commit -pre-commit install -``` - -This will setup a git pre-commit-hook that is executed on each commit and will report the linting problems. 
To run all hooks on all files use `pre-commit run -a`. - -### Submitting Changes - -#### Git Workflow - -1. **Fork the repository** on GitHub -2. **Create a feature branch** from `main`: - ```bash - git checkout -b feature/your-feature-name - ``` -3. **Make your changes** following the coding standards -4. **Add tests** for your changes -5. **Run tests** to ensure everything passes -6. **Commit your changes** with a clear commit message -7. **Push to your fork** and create a Pull Request - -#### Commit Message Format - -Use clear, descriptive commit messages: - -``` -feat: add support for S3 file system -fix: resolve memory leak in table reader -docs: update API documentation -test: add unit tests for schema validation -``` - -#### Pull Request Process - -1. **Create a Pull Request** with a clear description -2. **Link related issues** if applicable -3. **Ensure CI passes** - all tests must pass -4. **Request review** from maintainers -5. **Address feedback** and update the PR as needed -6. **Squash commits** if requested by reviewers - -### Community - -The Apache Iceberg community is built on the principles described in the [Apache Way](https://www.apache.org/theapacheway/index.html) and all who engage with the community are expected to be respectful, open, come with the best interests of the community in mind, and abide by the Apache Foundation [Code of Conduct](https://www.apache.org/foundation/policies/conduct.html). 
- -#### Getting Help - -- **Submit Issues**: [GitHub Issues](https://github.com/apache/iceberg-cpp/issues/new) for bug reports or feature requests -- **Mailing List**: [dev@iceberg.apache.org](mailto:dev@iceberg.apache.org) for discussions - - [Subscribe](mailto:dev-subscribe@iceberg.apache.org?subject=(send%20this%20email%20to%20subscribe)) - - [Unsubscribe](mailto:dev-unsubscribe@iceberg.apache.org?subject=(send%20this%20email%20to%20unsubscribe)) - - [Archives](https://lists.apache.org/list.html?dev@iceberg.apache.org) -- **Slack**: [Apache Iceberg Slack #cpp channel](https://join.slack.com/t/apache-iceberg/shared_invite/zt-1zbov3k6e-KtJfoaxp97YfX6dPz1Bk7A) - -#### Good First Issues - -New to the project? Check out our [good first issues](https://github.com/apache/iceberg-cpp/labels/good%20first%20issue) for an easy way to get started. - -### Release Process - -Releases are managed by the Apache Iceberg project maintainers. For information about the release process, please refer to the main Iceberg project documentation. - -## License - -Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) +* **Slack**: Join the `#cpp` channel on the [Apache Iceberg Slack workspace](https://join.slack.com/t/apache-iceberg/shared_invite/zt-1zbov3k6e-KtJfoaxp97YfX6dPz1Bk7A). +* **Mailing List**: Subscribe to the [dev mailing list](mailto:dev@iceberg.apache.org) for discussions. +* **GitHub Issues**: Report bugs or request features on [GitHub](https://github.com/apache/iceberg-cpp/issues). 
diff --git a/mkdocs/docs/releases.md b/mkdocs/docs/releases.md index a2e1b4d1f..cc1f7ddec 100644 --- a/mkdocs/docs/releases.md +++ b/mkdocs/docs/releases.md @@ -23,48 +23,25 @@ Apache Iceberg C++ releases are available for download from our GitHub releases ## Latest Release -### Apache Iceberg C++ 0.1.0 +### Apache Iceberg C++ 0.2.0 -**Released:** September 15, 2024 +**Released:** January 26, 2025 -**Download:** [v0.1.0](https://github.com/apache/iceberg-cpp/releases/tag/v0.1.0) +**Download:** [v0.2.0](https://github.com/apache/iceberg-cpp/releases/tag/v0.2.0) -#### What's New +#### Release Highlights -This is the first official release of Apache Iceberg C++. This release includes: - -- **Core Libraries**: Basic CMake support and iceberg library structure -- **Data Types**: Support for primitive types (int, long, string, boolean, etc.) -- **Schema Management**: Schema field definitions and schema conversion -- **Table Metadata**: Table metadata reading and writing capabilities -- **File I/O**: Local file system support using Arrow FileSystem -- **Avro Support**: Avro file reading and schema conversion -- **Arrow Integration**: Arrow C Data Interface and schema conversion -- **Partitioning**: Partition field and partition spec support -- **Sorting**: Sort order and sort field definitions -- **Expressions**: Basic expression support and literal types -- **Catalog**: In-memory catalog implementation -- **Table Scanning**: Basic table scan planning -- **Testing**: Comprehensive test suite with GoogleTest -- **Documentation**: API documentation generation with Doxygen - -#### Key Features - -- **C++23 Support**: Modern C++ features and standards -- **Cross-Platform**: Support for Linux, macOS, and Windows -- **Arrow Integration**: Seamless integration with Apache Arrow -- **Avro Compatibility**: Full Avro file format support -- **Memory Safety**: Smart pointer usage and RAII patterns -- **Error Handling**: Comprehensive error handling with Result types -- 
**Performance**: Optimized for high-performance data processing +- **Table Scan and Data Access**: Support for v2 deletes and metadata column reads. Enhanced ManifestReader with projection and filtering. Implemented file scan task reader with Arrow C Stream integration. +- **Table Operations**: Schema evolution (add, delete, update, and move columns). Table updates (properties, sort order, partition spec, location, and statistics). Transaction API with snapshot management (fast append). +- **REST Catalog**: Full REST Catalog client with namespace operations and table CRUD operations. Support for create, load, drop, list, update, and stage-create table operations. +- **Expression System**: Complete expression framework with literal expressions, type casting, and binary serialization. Inclusive/strict metrics evaluators, manifest evaluator, and residual evaluator. Aggregate expressions and projection evaluators. +- **Performance and I/O**: Optimized Avro reader/writer with direct encoding and multi-block support. Configurable Avro and Parquet readers/writers. +- **Catalog and Metadata**: InMemoryCatalog implementation with table management. Location provider and partition path generation. Schema selection, projection, and table metadata builder. +- **Miscellaneous**: Meson build system support. Initial documentation website and devcontainer. Improved code organization and type safety with validation. #### Installation -Download the release from GitHub and follow the installation instructions in our [Contributing guide](index.md). - -#### Breaking Changes - -This is the first release, so there are no breaking changes from previous versions. +Download the release from GitHub and follow the installation instructions in our [Getting Started guide](getting-started.md). 
## All Releases @@ -74,7 +51,8 @@ For a complete list of all releases, including release notes and download links, | Version | Release Date | Status | Download | |---------|-------------|--------|----------| -| [v0.1.0](https://github.com/apache/iceberg-cpp/releases/tag/v0.1.0) | September 15, 2024 | Latest | [Download](https://github.com/apache/iceberg-cpp/releases/tag/v0.1.0) | +| [v0.2.0](https://github.com/apache/iceberg-cpp/releases/tag/v0.2.0) | January 26, 2025 | Latest | [Download](https://github.com/apache/iceberg-cpp/releases/tag/v0.2.0) | +| [v0.1.0](https://github.com/apache/iceberg-cpp/releases/tag/v0.1.0) | September 15, 2024 | Stable | [Download](https://github.com/apache/iceberg-cpp/releases/tag/v0.1.0) | ## Release Process @@ -93,11 +71,11 @@ Download the source code from our GitHub releases page: ```bash # Download latest release -wget https://github.com/apache/iceberg-cpp/archive/refs/tags/v0.1.0.tar.gz +wget https://github.com/apache/iceberg-cpp/archive/refs/tags/v0.2.0.tar.gz # Extract and build -tar -xzf v0.1.0.tar.gz -cd iceberg-cpp-0.1.0 +tar -xzf v0.2.0.tar.gz +cd iceberg-cpp-0.2.0 ``` ### Pre-built Binaries @@ -130,7 +108,7 @@ Releases are signed with the Apache Iceberg project GPG key. 
Verify signatures u gpg --keyserver keyserver.ubuntu.com --recv-keys B5690EEEBB952194 # Verify the signature -gpg --verify iceberg-cpp-0.1.0.tar.gz.asc iceberg-cpp-0.1.0.tar.gz +gpg --verify iceberg-cpp-0.2.0.tar.gz.asc iceberg-cpp-0.2.0.tar.gz ``` ## Support diff --git a/mkdocs/mkdocs.yml b/mkdocs/mkdocs.yml index f81ca17e0..a27416773 100644 --- a/mkdocs/mkdocs.yml +++ b/mkdocs/mkdocs.yml @@ -47,8 +47,10 @@ markdown_extensions: permalink: true nav: - - Contributing: index.md + - Home: index.md + - Getting Started: getting-started.md - Releases: releases.md + - Contributing: contributing.md - API Documentation: api/index.html extra: diff --git a/src/iceberg/arrow/arrow_fs_file_io.cc b/src/iceberg/arrow/arrow_fs_file_io.cc index be62b79af..45a114082 100644 --- a/src/iceberg/arrow/arrow_fs_file_io.cc +++ b/src/iceberg/arrow/arrow_fs_file_io.cc @@ -47,6 +47,9 @@ Result ArrowFileSystemFileIO::ReadFile(const std::string& file_loca ICEBERG_ARROW_ASSIGN_OR_RETURN( auto read_bytes, file->Read(read_length, reinterpret_cast(&content[offset]))); + if (read_bytes == 0) { + return IOError("Unexpected EOF while reading file: {} bytes remain", remain); + } remain -= read_bytes; offset += read_bytes; }