Skip to content

Commit 8c66829

Browse files
committed
refactor: introduce ExpireSnapshots builder for snapshot expiration
Add ExpireSnapshots builder class to consolidate snapshot expiration operations under a fluent API. This change moves from direct method calls on MaintenanceTable to a builder pattern accessed via maintenance.expire_snapshots(). Key changes: - Add ExpireSnapshots class with fluent API methods - Migrate existing snapshot expiration functionality to builder pattern - Update API documentation examples to use new builder syntax - Update tests to use new expire_snapshots() builder methods - Maintain backward compatibility in MaintenanceTable implementation This provides a more intuitive and JAVA aligned API as suggested by @kevinjqliu and @aammar5
1 parent 6f1d1a7 commit 8c66829

File tree

3 files changed

+127
-15
lines changed

3 files changed

+127
-15
lines changed

mkdocs/docs/api.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,7 +1089,7 @@ Iceberg tables accumulate snapshots over time. Retaining too many can waste stor
10891089
from pyiceberg.table.maintenance import MaintenanceTable
10901090
10911091
maintenance = MaintenanceTable(table)
1092-
maintenance.retain_last_n_snapshots(5)
1092+
maintenance.expire_snapshots().retain_last_n(5)
10931093
```
10941094

10951095
#### Example: Expire snapshots older than 30 days, but keep at least 3
@@ -1100,7 +1100,7 @@ from pyiceberg.table.maintenance import MaintenanceTable
11001100
11011101
maintenance = MaintenanceTable(table)
11021102
thirty_days_ago = int((time.time() - 30 * 24 * 60 * 60) * 1000)
1103-
maintenance.expire_snapshots_with_retention_policy(
1103+
maintenance.expire_snapshots().with_retention_policy(
11041104
timestamp_ms=thirty_days_ago,
11051105
min_snapshots_to_keep=3
11061106
)
@@ -1209,7 +1209,7 @@ PyIceberg makes it easy to filter out data from a huge table and pull it into a
12091209

12101210
```python
12111211
# Expire old snapshots, but always keep last 10 and at least 5 total
1212-
maintenance.expire_snapshots_with_retention_policy(
1212+
maintenance.expire_snapshots().with_retention_policy(
12131213
timestamp_ms=thirty_days_ago,
12141214
retain_last_n=10,
12151215
min_snapshots_to_keep=5

pyiceberg/table/maintenance.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,14 @@ class MaintenanceTable:
3636
def __init__(self, tbl: Table) -> None:
3737
self.tbl = tbl
3838

39+
def expire_snapshots(self) -> "ExpireSnapshots":
40+
"""Return an ExpireSnapshots builder for snapshot expiration operations.
41+
42+
Returns:
43+
ExpireSnapshots builder for configuring and executing snapshot expiration.
44+
"""
45+
return ExpireSnapshots(self.tbl)
46+
3947
def expire_snapshot_by_id(self, snapshot_id: int) -> None:
4048
"""Expire a single snapshot by its ID.
4149
@@ -332,3 +340,107 @@ def _get_expiration_properties(self) -> tuple[Optional[int], Optional[int], Opti
332340
max_ref_age_ms = int(max_ref_age) if max_ref_age is not None else None
333341

334342
return max_snapshot_age, min_snapshots_to_keep, max_ref_age_ms
343+
344+
345+
class ExpireSnapshots:
346+
"""Builder for snapshot expiration operations."""
347+
348+
def __init__(self, tbl: Table) -> None:
349+
self.tbl = tbl
350+
self.maintenance = MaintenanceTable(tbl)
351+
352+
def by_id(self, snapshot_id: int) -> None:
353+
"""Expire a single snapshot by its ID.
354+
355+
Args:
356+
snapshot_id: The ID of the snapshot to expire.
357+
358+
Raises:
359+
ValueError: If the snapshot does not exist or is protected.
360+
"""
361+
self.maintenance.expire_snapshot_by_id(snapshot_id)
362+
363+
def by_ids(self, snapshot_ids: List[int]) -> None:
364+
"""Expire multiple snapshots by their IDs.
365+
366+
Args:
367+
snapshot_ids: List of snapshot IDs to expire.
368+
369+
Raises:
370+
ValueError: If any snapshot does not exist or is protected.
371+
"""
372+
self.maintenance._expire_snapshots_by_ids(snapshot_ids)
373+
374+
def older_than(self, timestamp_ms: int) -> None:
375+
"""Expire all unprotected snapshots with a timestamp older than a given value.
376+
377+
Args:
378+
timestamp_ms: Only snapshots with timestamp_ms < this value will be expired.
379+
"""
380+
self.maintenance.expire_snapshots_older_than(timestamp_ms)
381+
382+
def older_than_with_retention(
383+
self, timestamp_ms: int, retain_last_n: Optional[int] = None, min_snapshots_to_keep: Optional[int] = None
384+
) -> None:
385+
"""Expire all unprotected snapshots with a timestamp older than a given value, with retention strategies.
386+
387+
Args:
388+
timestamp_ms: Only snapshots with timestamp_ms < this value will be expired.
389+
retain_last_n: Always keep the last N snapshots regardless of age.
390+
min_snapshots_to_keep: Minimum number of snapshots to keep in total.
391+
"""
392+
self.maintenance.expire_snapshots_older_than_with_retention(
393+
timestamp_ms=timestamp_ms, retain_last_n=retain_last_n, min_snapshots_to_keep=min_snapshots_to_keep
394+
)
395+
396+
def with_retention_policy(
397+
self, timestamp_ms: Optional[int] = None, retain_last_n: Optional[int] = None, min_snapshots_to_keep: Optional[int] = None
398+
) -> None:
399+
"""Comprehensive snapshot expiration with multiple retention strategies.
400+
401+
This method provides a unified interface for snapshot expiration with various
402+
retention policies to ensure operational resilience while allowing space reclamation.
403+
404+
The method will use table properties as defaults if they are set:
405+
- history.expire.max-snapshot-age-ms: Default for timestamp_ms if not provided
406+
- history.expire.min-snapshots-to-keep: Default for min_snapshots_to_keep if not provided
407+
- history.expire.max-ref-age-ms: Used for ref expiration (branches/tags)
408+
409+
Args:
410+
timestamp_ms: Only snapshots with timestamp_ms < this value will be considered for expiration.
411+
If None, will use history.expire.max-snapshot-age-ms table property if set.
412+
retain_last_n: Always keep the last N snapshots regardless of age.
413+
Useful when regular snapshot creation occurs and users want to keep
414+
the last few for rollback purposes.
415+
min_snapshots_to_keep: Minimum number of snapshots to keep in total.
416+
Acts as a guardrail to prevent aggressive expiration logic.
417+
If None, will use history.expire.min-snapshots-to-keep table property if set.
418+
419+
Raises:
420+
ValueError: If retain_last_n or min_snapshots_to_keep is less than 1.
421+
422+
Examples:
423+
# Use table property defaults
424+
table.maintenance().expire_snapshots().with_retention_policy()
425+
426+
# Override defaults with explicit values
427+
table.maintenance().expire_snapshots().with_retention_policy(
428+
timestamp_ms=1234567890000,
429+
retain_last_n=10,
430+
min_snapshots_to_keep=5
431+
)
432+
"""
433+
self.maintenance.expire_snapshots_with_retention_policy(
434+
timestamp_ms=timestamp_ms, retain_last_n=retain_last_n, min_snapshots_to_keep=min_snapshots_to_keep
435+
)
436+
437+
def retain_last_n(self, n: int) -> None:
438+
"""Keep only the last N snapshots, expiring all others.
439+
440+
Args:
441+
n: Number of most recent snapshots to keep.
442+
443+
Raises:
444+
ValueError: If n is less than 1.
445+
"""
446+
self.maintenance.retain_last_n_snapshots(n)

tests/table/test_retention_strategies.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def test_retain_last_n_snapshots(table_v2: Table) -> None:
5252
uuid=uuid4(),
5353
)
5454
table_v2.catalog.commit_table.return_value = mock_response
55-
table_v2.maintenance.retain_last_n_snapshots(3)
55+
table_v2.maintenance.expire_snapshots().retain_last_n(3)
5656
table_v2.catalog.commit_table.assert_called_once()
5757
# Update metadata to reflect commit
5858
table_v2.metadata = mock_response.metadata
@@ -93,7 +93,7 @@ def test_min_snapshots_to_keep(table_v2: Table) -> None:
9393
uuid=uuid4(),
9494
)
9595
table_v2.catalog.commit_table.return_value = mock_response
96-
table_v2.maintenance.expire_snapshots_older_than_with_retention(timestamp_ms=4500, min_snapshots_to_keep=3)
96+
table_v2.maintenance.expire_snapshots().older_than_with_retention(timestamp_ms=4500, min_snapshots_to_keep=3)
9797
table_v2.catalog.commit_table.assert_called_once()
9898
table_v2.metadata = mock_response.metadata
9999
remaining_ids = {s.snapshot_id for s in table_v2.metadata.snapshots}
@@ -128,7 +128,7 @@ def test_combined_constraints(table_v2: Table) -> None:
128128
uuid=uuid4(),
129129
)
130130
table_v2.catalog.commit_table.return_value = mock_response
131-
table_v2.maintenance.expire_snapshots_with_retention_policy(timestamp_ms=3500, retain_last_n=2, min_snapshots_to_keep=4)
131+
table_v2.maintenance.expire_snapshots().with_retention_policy(timestamp_ms=3500, retain_last_n=2, min_snapshots_to_keep=4)
132132
table_v2.catalog.commit_table.assert_called_once()
133133
table_v2.metadata = mock_response.metadata
134134
remaining_ids = {s.snapshot_id for s in table_v2.metadata.snapshots}
@@ -156,7 +156,7 @@ def test_cannot_expire_protected_head_snapshot(table_v2: Table) -> None:
156156

157157
# Attempt to expire the HEAD snapshot and expect a ValueError
158158
with pytest.raises(ValueError, match=f"Snapshot with ID {HEAD_SNAPSHOT} is protected and cannot be expired."):
159-
table_v2.maintenance.expire_snapshot_by_id(HEAD_SNAPSHOT)
159+
table_v2.maintenance.expire_snapshots().by_id(HEAD_SNAPSHOT)
160160

161161
table_v2.catalog.commit_table.assert_not_called()
162162

@@ -179,7 +179,7 @@ def test_cannot_expire_tagged_snapshot(table_v2: Table) -> None:
179179
assert any(ref.snapshot_id == TAGGED_SNAPSHOT for ref in table_v2.metadata.refs.values())
180180

181181
with pytest.raises(ValueError, match=f"Snapshot with ID {TAGGED_SNAPSHOT} is protected and cannot be expired."):
182-
table_v2.maintenance.expire_snapshot_by_id(TAGGED_SNAPSHOT)
182+
table_v2.maintenance.expire_snapshots().by_id(TAGGED_SNAPSHOT)
183183

184184
table_v2.catalog.commit_table.assert_not_called()
185185

@@ -211,7 +211,7 @@ def test_expire_unprotected_snapshot(table_v2: Table) -> None:
211211
assert all(ref.snapshot_id != EXPIRE_SNAPSHOT for ref in table_v2.metadata.refs.values())
212212

213213
# Expire the snapshot
214-
table_v2.maintenance.expire_snapshot_by_id(EXPIRE_SNAPSHOT)
214+
table_v2.maintenance.expire_snapshots().by_id(EXPIRE_SNAPSHOT)
215215

216216
table_v2.catalog.commit_table.assert_called_once()
217217
remaining_snapshots = table_v2.metadata.snapshots
@@ -227,7 +227,7 @@ def test_expire_nonexistent_snapshot_raises(table_v2: Table) -> None:
227227
table_v2.metadata = table_v2.metadata.model_copy(update={"refs": {}})
228228

229229
with pytest.raises(ValueError, match=f"Snapshot with ID {NONEXISTENT_SNAPSHOT} does not exist."):
230-
table_v2.maintenance.expire_snapshot_by_id(NONEXISTENT_SNAPSHOT)
230+
table_v2.maintenance.expire_snapshots().by_id(NONEXISTENT_SNAPSHOT)
231231

232232
table_v2.catalog.commit_table.assert_not_called()
233233

@@ -265,7 +265,7 @@ def test_expire_snapshots_by_timestamp_skips_protected(table_v2: Table) -> None:
265265
)
266266
table_v2.catalog.commit_table.return_value = mock_response
267267

268-
table_v2.maintenance.expire_snapshots_older_than(future_timestamp)
268+
table_v2.maintenance.expire_snapshots().older_than(future_timestamp)
269269

270270
# Both protected snapshots should remain
271271
remaining_ids = {s.snapshot_id for s in table_v2.metadata.snapshots}
@@ -326,7 +326,7 @@ def test_expire_snapshots_by_ids(table_v2: Table) -> None:
326326
assert all(ref.snapshot_id not in (EXPIRE_SNAPSHOT_1, EXPIRE_SNAPSHOT_2) for ref in table_v2.metadata.refs.values())
327327

328328
# Expire the snapshots
329-
table_v2.maintenance._expire_snapshots_by_ids([EXPIRE_SNAPSHOT_1, EXPIRE_SNAPSHOT_2])
329+
table_v2.maintenance.expire_snapshots().by_ids([EXPIRE_SNAPSHOT_1, EXPIRE_SNAPSHOT_2])
330330

331331
table_v2.catalog.commit_table.assert_called_once()
332332
remaining_snapshots = table_v2.metadata.snapshots
@@ -380,7 +380,7 @@ def test_expire_snapshots_with_table_property_defaults(table_v2: Table) -> None:
380380
table_v2.catalog.commit_table.return_value = mock_response
381381

382382
# Call expire without explicit parameters - should use table properties
383-
table_v2.maintenance.expire_snapshots_with_retention_policy()
383+
table_v2.maintenance.expire_snapshots().with_retention_policy()
384384

385385
table_v2.catalog.commit_table.assert_called_once()
386386
table_v2.metadata = mock_response.metadata
@@ -425,7 +425,7 @@ def test_explicit_parameters_override_table_properties(table_v2: Table) -> None:
425425
table_v2.catalog.commit_table.return_value = mock_response
426426

427427
# Call expire with explicit parameters that should override the properties
428-
table_v2.maintenance.expire_snapshots_with_retention_policy(
428+
table_v2.maintenance.expire_snapshots().with_retention_policy(
429429
timestamp_ms=1500, # Only expire snapshots older than this
430430
min_snapshots_to_keep=4, # Keep at least 4 snapshots (overrides property of 2)
431431
)
@@ -458,7 +458,7 @@ def test_expire_snapshots_no_properties_no_parameters(table_v2: Table) -> None:
458458
table_v2.catalog = MagicMock()
459459

460460
# Call expire with no parameters
461-
table_v2.maintenance.expire_snapshots_with_retention_policy()
461+
table_v2.maintenance.expire_snapshots().with_retention_policy()
462462

463463
# Should not attempt to expire anything since no criteria were provided
464464
table_v2.catalog.commit_table.assert_not_called()

0 commit comments

Comments
 (0)