Skip to content

Commit 7a31887

Browse files
fix(table): validate snapshot-log timestamps in metadata parsing
1 parent 8f5525b commit 7a31887

File tree

2 files changed

+57
-1
lines changed

2 files changed

+57
-1
lines changed

pyiceberg/table/metadata.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
DEFAULT_SCHEMA_ID = 0
6868

6969
SUPPORTED_TABLE_FORMAT_VERSION = 2
70+
ONE_MINUTE_MS = 60_000
7071

7172

7273
def cleanup_snapshot_id(data: dict[str, Any]) -> dict[str, Any]:
@@ -125,6 +126,29 @@ def construct_refs(table_metadata: TableMetadata) -> TableMetadata:
125126
return table_metadata
126127

127128

129+
def check_snapshot_timestamps(table_metadata: TableMetadata) -> TableMetadata:
130+
"""Validate snapshot-log ordering and last-updated-ms, tolerating up to one minute of clock skew."""
131+
last_snapshot_log_entry: SnapshotLogEntry | None = None
132+
for snapshot_log_entry in table_metadata.snapshot_log:
133+
if (
134+
last_snapshot_log_entry is not None
135+
and snapshot_log_entry.timestamp_ms - last_snapshot_log_entry.timestamp_ms < -ONE_MINUTE_MS
136+
):
137+
raise ValidationError("[BUG] Expected sorted snapshot log entries.")
138+
last_snapshot_log_entry = snapshot_log_entry
139+
140+
if (
141+
last_snapshot_log_entry is not None
142+
and table_metadata.last_updated_ms - last_snapshot_log_entry.timestamp_ms < -ONE_MINUTE_MS
143+
):
144+
raise ValidationError(
145+
f"Invalid update timestamp {table_metadata.last_updated_ms}: "
146+
f"before last snapshot log entry at {last_snapshot_log_entry.timestamp_ms}"
147+
)
148+
149+
return table_metadata
150+
151+
128152
class TableMetadataCommonFields(IcebergBaseModel):
129153
"""Metadata for an Iceberg table as specified in the Apache Iceberg spec.
130154
@@ -378,6 +402,10 @@ def cleanup_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]:
378402
def construct_refs(self) -> TableMetadataV1:
379403
return construct_refs(self)
380404

405+
@model_validator(mode="after")
406+
def check_snapshot_timestamps(self) -> TableMetadata:
407+
return check_snapshot_timestamps(self)
408+
381409
@model_validator(mode="before")
382410
def set_v2_compatible_defaults(cls, data: dict[str, Any]) -> dict[str, Any]:
383411
"""Set default values to be compatible with the format v2.
@@ -519,6 +547,10 @@ def check_sort_orders(self) -> TableMetadata:
519547
def construct_refs(self) -> TableMetadata:
520548
return construct_refs(self)
521549

550+
@model_validator(mode="after")
551+
def check_snapshot_timestamps(self) -> TableMetadata:
552+
return check_snapshot_timestamps(self)
553+
522554
format_version: Literal[2] = Field(alias="format-version", default=2)
523555
"""An integer version number for the format. Implementations must throw
524556
an exception if a table’s version is higher than the supported version."""
@@ -563,6 +595,10 @@ def check_sort_orders(self) -> TableMetadata:
563595
def construct_refs(self) -> TableMetadata:
564596
return construct_refs(self)
565597

598+
@model_validator(mode="after")
599+
def check_snapshot_timestamps(self) -> TableMetadata:
600+
return check_snapshot_timestamps(self)
601+
566602
format_version: Literal[3] = Field(alias="format-version", default=3)
567603
"""An integer version number for the format. Implementations must throw
568604
an exception if a table’s version is higher than the supported version."""

tests/table/test_metadata.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
import io
2020
import json
21-
from copy import copy
21+
from copy import copy, deepcopy
2222
from typing import Any
2323
from unittest.mock import MagicMock, patch
2424
from uuid import UUID
@@ -415,6 +415,26 @@ def test_sort_order_unsorted() -> None:
415415
assert len(table_metadata.sort_orders) == 0
416416

417417

418+
def test_snapshot_log_entries_are_sorted_with_tolerance(example_table_metadata_v2: dict[str, Any]) -> None:
419+
table_metadata = deepcopy(example_table_metadata_v2)
420+
table_metadata["snapshot-log"][1]["timestamp-ms"] = table_metadata["snapshot-log"][0]["timestamp-ms"] - 60_001
421+
422+
with pytest.raises(ValidationError) as exc_info:
423+
TableMetadataUtil.parse_raw(json.dumps(table_metadata))
424+
425+
assert "Expected sorted snapshot log entries" in str(exc_info.value)
426+
427+
428+
def test_last_updated_ms_not_before_last_snapshot_log_entry(example_table_metadata_v2: dict[str, Any]) -> None:
429+
table_metadata = deepcopy(example_table_metadata_v2)
430+
table_metadata["last-updated-ms"] = table_metadata["snapshot-log"][-1]["timestamp-ms"] - 60_001
431+
432+
with pytest.raises(ValidationError) as exc_info:
433+
TableMetadataUtil.parse_raw(json.dumps(table_metadata))
434+
435+
assert "before last snapshot log entry at" in str(exc_info.value)
436+
437+
418438
def test_invalid_partition_spec() -> None:
419439
table_metadata_spec_not_found = {
420440
"format-version": 2,

0 commit comments

Comments
 (0)