Skip to content

Commit 1fad9ba

Browse files
committed
fix(update): clean up delete files from expired manifests
Reachable cleanup now reads live paths from data and delete manifests, matching Java's ManifestFiles.readPaths behavior.
1 parent 988f363 commit 1fad9ba

2 files changed

Lines changed: 25 additions & 25 deletions

File tree

src/iceberg/test/expire_snapshots_test.cc

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,8 @@ TEST_F(ExpireSnapshotsCleanupTest, IgnoresExpiredDeleteManifestReadFailures) {
376376
const auto expired_data_manifest_path = table_location_ + "/metadata/expired-data.avro";
377377
const auto expired_delete_manifest_path =
378378
table_location_ + "/metadata/expired-delete.avro";
379+
const auto missing_delete_manifest_path =
380+
table_location_ + "/metadata/missing-delete.avro";
379381
const auto expired_manifest_list_path =
380382
table_location_ + "/metadata/expired-manifest-list.avro";
381383
const auto current_manifest_list_path =
@@ -389,6 +391,7 @@ TEST_F(ExpireSnapshotsCleanupTest, IgnoresExpiredDeleteManifestReadFailures) {
389391
expired_delete_manifest_path, kExpiredSnapshotId,
390392
{MakeEntry(ManifestStatus::kAdded, kExpiredSnapshotId, kExpiredSequenceNumber,
391393
MakePositionDeleteFile(expired_delete_file_path))});
394+
expired_delete_manifest.manifest_path = missing_delete_manifest_path;
392395
WriteManifestList(expired_manifest_list_path, kExpiredSnapshotId,
393396
/*parent_snapshot_id=*/0, kExpiredSequenceNumber,
394397
{expired_data_manifest, expired_delete_manifest});
@@ -406,7 +409,7 @@ TEST_F(ExpireSnapshotsCleanupTest, IgnoresExpiredDeleteManifestReadFailures) {
406409
EXPECT_THAT(update->Commit(), IsOk());
407410
EXPECT_THAT(deleted_files, testing::UnorderedElementsAre(expired_data_file_path,
408411
expired_data_manifest_path,
409-
expired_delete_manifest_path,
412+
missing_delete_manifest_path,
410413
expired_manifest_list_path));
411414
}
412415

@@ -443,10 +446,10 @@ TEST_F(ExpireSnapshotsCleanupTest, DeletesExpiredFiles) {
443446
[&deleted_files](const std::string& path) { deleted_files.push_back(path); });
444447

445448
EXPECT_THAT(update->Commit(), IsOk());
446-
EXPECT_THAT(deleted_files, testing::UnorderedElementsAre(expired_data_file_path,
447-
expired_data_manifest_path,
448-
expired_delete_manifest_path,
449-
expired_manifest_list_path));
449+
EXPECT_THAT(deleted_files, testing::UnorderedElementsAre(
450+
expired_data_file_path, expired_delete_file_path,
451+
expired_data_manifest_path, expired_delete_manifest_path,
452+
expired_manifest_list_path));
450453
}
451454

452455
TEST_F(ExpireSnapshotsCleanupTest, ExecutorDispatchesDeletesConcurrently) {

src/iceberg/update/expire_snapshots.cc

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -236,10 +236,10 @@ class ReachableFileCleanup : public FileCleanupStrategy {
236236

237237
if (!manifests_to_delete.empty()) {
238238
if (level == CleanupLevel::kAll) {
239-
// Deleting data files
240-
auto data_files_to_delete = FindDataFilesToDelete(
239+
// Deleting data and delete files.
240+
auto files_to_delete = FindFilesToDelete(
241241
metadata_before_expiration, manifests_to_delete, current_manifests);
242-
DeleteFiles(data_files_to_delete);
242+
DeleteFiles(files_to_delete);
243243
}
244244

245245
// Deleting manifest files
@@ -300,25 +300,22 @@ class ReachableFileCleanup : public FileCleanupStrategy {
300300
return manifests_to_delete;
301301
}
302302

303-
Result<std::unordered_set<std::string>> ReadLiveDataFilePaths(
303+
Result<std::unordered_set<std::string>> ReadLiveFilePaths(
304304
const TableMetadata& metadata, const ManifestFile& manifest) {
305-
ICEBERG_PRECHECK(manifest.content == ManifestContent::kData,
306-
"Cannot read data file paths from a delete manifest: {}",
307-
manifest.manifest_path);
308-
309305
// TODO(shangxinli): optimize by only reading file paths
310306
ICEBERG_ASSIGN_OR_RAISE(auto reader,
311307
MakeManifestReader(manifest, file_io_, metadata));
308+
reader->Select({"file_path"});
312309
ICEBERG_ASSIGN_OR_RAISE(auto entries, reader->LiveEntries());
313310

314-
std::unordered_set<std::string> data_file_paths;
311+
std::unordered_set<std::string> file_paths;
315312
for (const auto& entry : entries) {
316313
if (entry.data_file) {
317-
data_file_paths.insert(entry.data_file->file_path);
314+
file_paths.insert(entry.data_file->file_path);
318315
}
319316
}
320317

321-
return data_file_paths;
318+
return file_paths;
322319
}
323320

324321
/// \brief Project manifests to manifest paths for deletion.
@@ -332,8 +329,8 @@ class ReachableFileCleanup : public FileCleanupStrategy {
332329
return manifest_paths;
333330
}
334331

335-
/// \brief Find data files to delete from manifests being removed.
336-
std::unordered_set<std::string> FindDataFilesToDelete(
332+
/// \brief Find files to delete from manifests being removed.
333+
std::unordered_set<std::string> FindFilesToDelete(
337334
const TableMetadata& metadata,
338335
const std::unordered_set<ManifestFile>& manifests_to_delete,
339336
const std::unordered_set<ManifestFile>& current_manifests) {
@@ -342,7 +339,7 @@ class ReachableFileCleanup : public FileCleanupStrategy {
342339
plan_executor_, manifests_to_delete,
343340
[this, &metadata](
344341
const ManifestFile& manifest) -> Result<std::unordered_set<std::string>> {
345-
auto result = ReadLiveDataFilePaths(metadata, manifest);
342+
auto result = ReadLiveFilePaths(metadata, manifest);
346343
// Ignore expired-manifest read failures and keep scanning candidates.
347344
if (!result.has_value()) {
348345
return std::unordered_set<std::string>{};
@@ -352,26 +349,26 @@ class ReachableFileCleanup : public FileCleanupStrategy {
352349
if (!live_file_results.has_value()) {
353350
return {};
354351
}
355-
auto data_files_to_delete = std::move(live_file_results).value();
356-
if (data_files_to_delete.empty()) {
357-
return data_files_to_delete;
352+
auto files_to_delete = std::move(live_file_results).value();
353+
if (files_to_delete.empty()) {
354+
return files_to_delete;
358355
}
359356

360357
// Remove files still referenced by current manifests.
361358
auto current_live_results =
362359
ParallelCollect(plan_executor_, current_manifests,
363360
[this, &metadata](const ManifestFile& manifest) {
364-
return ReadLiveDataFilePaths(metadata, manifest);
361+
return ReadLiveFilePaths(metadata, manifest);
365362
});
366363
// Fail closed if any retained manifest cannot be read safely.
367364
if (!current_live_results.has_value()) {
368365
return std::unordered_set<std::string>{};
369366
}
370367
for (const auto& file_path : current_live_results.value()) {
371-
data_files_to_delete.erase(file_path);
368+
files_to_delete.erase(file_path);
372369
}
373370

374-
return data_files_to_delete;
371+
return files_to_delete;
375372
}
376373
};
377374

0 commit comments

Comments
 (0)