|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +#include "iceberg/update/fast_append.h" |
| 21 | + |
| 22 | +#include <format> |
| 23 | + |
| 24 | +#include <gmock/gmock.h> |
| 25 | +#include <gtest/gtest.h> |
| 26 | + |
| 27 | +#include "iceberg/avro/avro_register.h" |
| 28 | +#include "iceberg/partition_spec.h" |
| 29 | +#include "iceberg/schema.h" |
| 30 | +#include "iceberg/table_metadata.h" |
| 31 | +#include "iceberg/test/matchers.h" |
| 32 | +#include "iceberg/test/test_resource.h" |
| 33 | +#include "iceberg/test/update_test_base.h" |
| 34 | +#include "iceberg/util/uuid.h" |
| 35 | + |
| 36 | +namespace iceberg { |
| 37 | + |
/// Fixture shared by all FastAppend tests.
///
/// SetUp() re-registers the table under `table_ident_` from the
/// "TableMetadataV2ValidMinimal.json" resource and prepares two data files
/// (`file_a_`, `file_b_`) that the individual tests append.
class FastAppendTest : public UpdateTestBase {
 protected:
  /// Calls avro::RegisterAll() once for the whole test suite.
  static void SetUpTestSuite() { avro::RegisterAll(); }

  void SetUp() override {
    UpdateTestBase::SetUp();

    // Drop the table currently registered under table_ident_ (presumably created by
    // UpdateTestBase::SetUp -- confirm there) without purging its files, so it can be
    // registered again below from the metadata written by this fixture.
    ASSERT_THAT(catalog_->DropTable(table_ident_, /*purge=*/false), IsOk());

    // A fresh UUID keeps the metadata file name unique per test.
    const auto unique_suffix = Uuid::GenerateV7().ToString();
    const auto metadata_path = std::format("{}/metadata/00001-{}.metadata.json",
                                           table_location_, unique_suffix);

    // Load the canned metadata, point it at this test's table location, and persist it.
    ICEBERG_UNWRAP_OR_FAIL(
        auto base_metadata,
        ReadTableMetadataFromResource("TableMetadataV2ValidMinimal.json"));
    base_metadata->location = table_location_;
    ASSERT_THAT(TableMetadataUtil::Write(*file_io_, metadata_path, *base_metadata),
                IsOk());

    ICEBERG_UNWRAP_OR_FAIL(table_, catalog_->RegisterTable(table_ident_, metadata_path));

    // Cache the partition spec and schema of the newly registered table.
    ICEBERG_UNWRAP_OR_FAIL(spec_, table_->spec());
    ICEBERG_UNWRAP_OR_FAIL(schema_, table_->schema());

    // Data files used by most tests: (records, bytes) = (100, 1024) and (200, 2048).
    file_a_ = CreateDataFile("/data/file_a.parquet", 100, 1024);
    file_b_ = CreateDataFile("/data/file_b.parquet", 200, 2048);
  }

  /// Builds an in-memory Parquet data-file descriptor under the table location.
  ///
  /// \param path Path appended to `table_location_` to form the file path.
  /// \param record_count Value stored in `record_count`.
  /// \param size Value stored in `file_size_in_bytes`.
  /// \param partition_value Long value used as the single partition value.
  /// \return The populated DataFile; it uses the spec id of `spec_`, so `spec_` must
  ///         already be set (SetUp does this before calling).
  std::shared_ptr<DataFile> CreateDataFile(const std::string& path, int64_t record_count,
                                           int64_t size, int64_t partition_value = 0) {
    auto file = std::make_shared<DataFile>();

    // What kind of file this is and where it lives.
    file->content = DataFile::Content::kData;
    file->file_format = FileFormatType::kParquet;
    file->file_path = table_location_ + path;

    // Metrics reported in the snapshot summary.
    file->record_count = record_count;
    file->file_size_in_bytes = size;

    // The base table is partitioned by identity(x), hence exactly one partition value.
    file->partition_spec_id = spec_->spec_id();
    file->partition =
        PartitionValues(std::vector<Literal>{Literal::Long(partition_value)});

    return file;
  }

  std::shared_ptr<PartitionSpec> spec_;  // partition spec of the registered table
  std::shared_ptr<Schema> schema_;       // schema of the registered table
  std::shared_ptr<DataFile> file_a_;     // 100 records, 1024 bytes
  std::shared_ptr<DataFile> file_b_;     // 200 records, 2048 bytes
};
| 86 | + |
| 87 | +TEST_F(FastAppendTest, AppendDataFile) { |
| 88 | + std::shared_ptr<FastAppend> fast_append; |
| 89 | + ICEBERG_UNWRAP_OR_FAIL(fast_append, table_->NewFastAppend()); |
| 90 | + fast_append->AppendFile(file_a_); |
| 91 | + |
| 92 | + EXPECT_THAT(fast_append->Commit(), IsOk()); |
| 93 | + |
| 94 | + EXPECT_THAT(table_->Refresh(), IsOk()); |
| 95 | + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); |
| 96 | + EXPECT_EQ(snapshot->summary.at("added-data-files"), "1"); |
| 97 | + EXPECT_EQ(snapshot->summary.at("added-records"), "100"); |
| 98 | + EXPECT_EQ(snapshot->summary.at("added-files-size"), "1024"); |
| 99 | +} |
| 100 | + |
| 101 | +TEST_F(FastAppendTest, AppendMultipleDataFiles) { |
| 102 | + std::shared_ptr<FastAppend> fast_append; |
| 103 | + ICEBERG_UNWRAP_OR_FAIL(fast_append, table_->NewFastAppend()); |
| 104 | + fast_append->AppendFile(file_a_); |
| 105 | + fast_append->AppendFile(file_b_); |
| 106 | + |
| 107 | + EXPECT_THAT(fast_append->Commit(), IsOk()); |
| 108 | + |
| 109 | + EXPECT_THAT(table_->Refresh(), IsOk()); |
| 110 | + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); |
| 111 | + EXPECT_EQ(snapshot->summary.at("added-data-files"), "2"); |
| 112 | + EXPECT_EQ(snapshot->summary.at("added-records"), "300"); |
| 113 | + EXPECT_EQ(snapshot->summary.at("added-files-size"), "3072"); |
| 114 | +} |
| 115 | + |
// Appending ten files spread over two partition values reports the summed totals.
TEST_F(FastAppendTest, AppendManyFiles) {
  std::shared_ptr<FastAppend> append_op;
  ICEBERG_UNWRAP_OR_FAIL(append_op, table_->NewFastAppend());

  constexpr int kFileCount = 10;
  int64_t expected_records = 0;
  int64_t expected_bytes = 0;

  for (int i = 0; i < kFileCount; ++i) {
    // File i has 10 + i records and 100 + 10 * i bytes; partition values alternate 0/1.
    auto file = CreateDataFile(std::format("/data/file_{}.parquet", i),
                               /*record_count=*/10 + i,
                               /*size=*/100 + i * 10,
                               /*partition_value=*/i % 2);
    // Accumulate the expected totals before ownership is handed to the append.
    expected_records += file->record_count;
    expected_bytes += file->file_size_in_bytes;
    append_op->AppendFile(std::move(file));
  }

  EXPECT_THAT(append_op->Commit(), IsOk());

  // Reload the table so the committed snapshot becomes visible.
  EXPECT_THAT(table_->Refresh(), IsOk());
  ICEBERG_UNWRAP_OR_FAIL(auto committed, table_->current_snapshot());

  const auto& summary = committed->summary;
  EXPECT_EQ(summary.at("added-data-files"), std::to_string(kFileCount));
  EXPECT_EQ(summary.at("added-records"), std::to_string(expected_records));
  EXPECT_EQ(summary.at("added-files-size"), std::to_string(expected_bytes));
}
| 141 | + |
| 142 | +TEST_F(FastAppendTest, EmptyTableAppendUpdatesSequenceNumbers) { |
| 143 | + EXPECT_THAT(table_->current_snapshot(), HasErrorMessage("No current snapshot")); |
| 144 | + const int64_t base_sequence_number = table_->metadata()->last_sequence_number; |
| 145 | + |
| 146 | + std::shared_ptr<FastAppend> fast_append; |
| 147 | + ICEBERG_UNWRAP_OR_FAIL(fast_append, table_->NewFastAppend()); |
| 148 | + fast_append->AppendFile(file_a_); |
| 149 | + |
| 150 | + EXPECT_THAT(fast_append->Commit(), IsOk()); |
| 151 | + |
| 152 | + EXPECT_THAT(table_->Refresh(), IsOk()); |
| 153 | + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); |
| 154 | + EXPECT_EQ(snapshot->sequence_number, base_sequence_number + 1); |
| 155 | + EXPECT_EQ(table_->metadata()->last_sequence_number, base_sequence_number + 1); |
| 156 | +} |
| 157 | + |
| 158 | +TEST_F(FastAppendTest, AppendNullFile) { |
| 159 | + std::shared_ptr<FastAppend> fast_append; |
| 160 | + ICEBERG_UNWRAP_OR_FAIL(fast_append, table_->NewFastAppend()); |
| 161 | + fast_append->AppendFile(nullptr); |
| 162 | + |
| 163 | + auto result = fast_append->Commit(); |
| 164 | + EXPECT_FALSE(result.has_value()); |
| 165 | + EXPECT_THAT(result, HasErrorMessage("Invalid data file: null")); |
| 166 | + EXPECT_THAT(table_->current_snapshot(), HasErrorMessage("No current snapshot")); |
| 167 | +} |
| 168 | + |
| 169 | +TEST_F(FastAppendTest, AppendDuplicateFile) { |
| 170 | + std::shared_ptr<FastAppend> fast_append; |
| 171 | + ICEBERG_UNWRAP_OR_FAIL(fast_append, table_->NewFastAppend()); |
| 172 | + fast_append->AppendFile(file_a_); |
| 173 | + fast_append->AppendFile(file_a_); // Add same file twice |
| 174 | + |
| 175 | + EXPECT_THAT(fast_append->Commit(), IsOk()); |
| 176 | + |
| 177 | + EXPECT_THAT(table_->Refresh(), IsOk()); |
| 178 | + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); |
| 179 | + // Should only count the file once |
| 180 | + EXPECT_EQ(snapshot->summary.at("added-data-files"), "1"); |
| 181 | + EXPECT_EQ(snapshot->summary.at("added-records"), "100"); |
| 182 | +} |
| 183 | + |
| 184 | +TEST_F(FastAppendTest, SetSnapshotProperty) { |
| 185 | + std::shared_ptr<FastAppend> fast_append; |
| 186 | + ICEBERG_UNWRAP_OR_FAIL(fast_append, table_->NewFastAppend()); |
| 187 | + fast_append->Set("custom-property", "custom-value"); |
| 188 | + fast_append->AppendFile(file_a_); |
| 189 | + |
| 190 | + EXPECT_THAT(fast_append->Commit(), IsOk()); |
| 191 | + |
| 192 | + EXPECT_THAT(table_->Refresh(), IsOk()); |
| 193 | + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); |
| 194 | + EXPECT_EQ(snapshot->summary.at("custom-property"), "custom-value"); |
| 195 | +} |
| 196 | + |
| 197 | +} // namespace iceberg |
0 commit comments