Skip to content

Commit 4d4b173

Browse files
committed
Refactor unit tests and test non-positive size
1 parent a13caf3 commit 4d4b173

File tree

1 file changed

+122
-83
lines changed

1 file changed

+122
-83
lines changed

tests/utils/test_manifest.py

Lines changed: 122 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
# under the License.
1717
# pylint: disable=redefined-outer-name,arguments-renamed,fixme
1818
import importlib
19+
from pathlib import Path
1920
from tempfile import TemporaryDirectory
21+
from typing import Any
2022

2123
import fastavro
2224
import pytest
@@ -937,52 +939,56 @@ def test_manifest_writer_tell(format_version: TableVersion) -> None:
937939
assert after_entry_bytes > initial_bytes, "Bytes should increase after adding entry"
938940

939941

942+
def _create_test_manifest_list(module: Any, io: PyArrowFileIO, tmp_dir: str, name: str, snapshot_id: int) -> str:
943+
schema = Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=True))
944+
spec = UNPARTITIONED_PARTITION_SPEC
945+
946+
manifest_path = f"{tmp_dir}/manifest-{name}.avro"
947+
with module.write_manifest(
948+
format_version=2,
949+
spec=spec,
950+
schema=schema,
951+
output_file=io.new_output(manifest_path),
952+
snapshot_id=snapshot_id,
953+
avro_compression="zstandard",
954+
) as writer:
955+
data_file = module.DataFile.from_args(
956+
content=module.DataFileContent.DATA,
957+
file_path=f"{tmp_dir}/data-{name}.parquet",
958+
file_format=module.FileFormat.PARQUET,
959+
partition=Record(),
960+
record_count=100,
961+
file_size_in_bytes=1000,
962+
)
963+
writer.add_entry(
964+
module.ManifestEntry.from_args(
965+
status=module.ManifestEntryStatus.ADDED,
966+
snapshot_id=snapshot_id,
967+
data_file=data_file,
968+
)
969+
)
970+
manifest_file = writer.to_manifest_file()
971+
972+
list_path = f"{tmp_dir}/manifest-list-{name}.avro"
973+
with module.write_manifest_list(
974+
format_version=2,
975+
output_file=io.new_output(list_path),
976+
snapshot_id=snapshot_id,
977+
parent_snapshot_id=snapshot_id - 1 if snapshot_id > 1 else None,
978+
sequence_number=snapshot_id,
979+
avro_compression="zstandard",
980+
) as list_writer:
981+
list_writer.add_manifests([manifest_file])
982+
983+
return list_path
984+
985+
940986
def test_clear_manifest_cache() -> None:
941987
"""Test that clear_manifest_cache() clears cache entries while keeping cache enabled."""
942988
io = PyArrowFileIO()
943989

944990
with TemporaryDirectory() as tmp_dir:
945-
schema = Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=True))
946-
spec = UNPARTITIONED_PARTITION_SPEC
947-
948-
# Create a manifest file
949-
manifest_path = f"{tmp_dir}/manifest.avro"
950-
with write_manifest(
951-
format_version=2,
952-
spec=spec,
953-
schema=schema,
954-
output_file=io.new_output(manifest_path),
955-
snapshot_id=1,
956-
avro_compression="zstandard",
957-
) as writer:
958-
data_file = DataFile.from_args(
959-
content=DataFileContent.DATA,
960-
file_path=f"{tmp_dir}/data.parquet",
961-
file_format=FileFormat.PARQUET,
962-
partition=Record(),
963-
record_count=100,
964-
file_size_in_bytes=1000,
965-
)
966-
writer.add_entry(
967-
ManifestEntry.from_args(
968-
status=ManifestEntryStatus.ADDED,
969-
snapshot_id=1,
970-
data_file=data_file,
971-
)
972-
)
973-
manifest_file = writer.to_manifest_file()
974-
975-
# Create a manifest list
976-
list_path = f"{tmp_dir}/manifest-list.avro"
977-
with write_manifest_list(
978-
format_version=2,
979-
output_file=io.new_output(list_path),
980-
snapshot_id=1,
981-
parent_snapshot_id=None,
982-
sequence_number=1,
983-
avro_compression="zstandard",
984-
) as list_writer:
985-
list_writer.add_manifests([manifest_file])
991+
list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="clear", snapshot_id=1)
986992

987993
# Populate the cache
988994
_manifests(io, list_path)
@@ -1001,9 +1007,10 @@ def test_clear_manifest_cache() -> None:
10011007
assert len(cache_after) == 0, "Cache should be empty after clear"
10021008

10031009

1004-
def test_manifest_cache_can_be_disabled_with_zero_size(monkeypatch: pytest.MonkeyPatch) -> None:
1005-
"""Test that setting manifest-cache-size to 0 disables caching."""
1006-
monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", "0")
1010+
@pytest.mark.parametrize("cache_size", ["0", "-1"])
1011+
def test_manifest_cache_can_be_disabled_with_non_positive_size(monkeypatch: pytest.MonkeyPatch, cache_size: str) -> None:
1012+
"""Test that non-positive manifest-cache-size values disable caching."""
1013+
monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", cache_size)
10071014
importlib.reload(manifest_module)
10081015

10091016
try:
@@ -1013,45 +1020,7 @@ def test_manifest_cache_can_be_disabled_with_zero_size(monkeypatch: pytest.Monke
10131020
io = PyArrowFileIO()
10141021

10151022
with TemporaryDirectory() as tmp_dir:
1016-
schema = Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=True))
1017-
spec = UNPARTITIONED_PARTITION_SPEC
1018-
1019-
manifest_path = f"{tmp_dir}/manifest.avro"
1020-
with manifest_module.write_manifest(
1021-
format_version=2,
1022-
spec=spec,
1023-
schema=schema,
1024-
output_file=io.new_output(manifest_path),
1025-
snapshot_id=1,
1026-
avro_compression="zstandard",
1027-
) as writer:
1028-
data_file = manifest_module.DataFile.from_args(
1029-
content=manifest_module.DataFileContent.DATA,
1030-
file_path=f"{tmp_dir}/data.parquet",
1031-
file_format=manifest_module.FileFormat.PARQUET,
1032-
partition=Record(),
1033-
record_count=100,
1034-
file_size_in_bytes=1000,
1035-
)
1036-
writer.add_entry(
1037-
manifest_module.ManifestEntry.from_args(
1038-
status=manifest_module.ManifestEntryStatus.ADDED,
1039-
snapshot_id=1,
1040-
data_file=data_file,
1041-
)
1042-
)
1043-
manifest_file = writer.to_manifest_file()
1044-
1045-
list_path = f"{tmp_dir}/manifest-list.avro"
1046-
with manifest_module.write_manifest_list(
1047-
format_version=2,
1048-
output_file=io.new_output(list_path),
1049-
snapshot_id=1,
1050-
parent_snapshot_id=None,
1051-
sequence_number=1,
1052-
avro_compression="zstandard",
1053-
) as list_writer:
1054-
list_writer.add_manifests([manifest_file])
1023+
list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="disabled", snapshot_id=1)
10551024

10561025
manifests_first_call = manifest_module._manifests(io, list_path)
10571026
manifests_second_call = manifest_module._manifests(io, list_path)
@@ -1061,3 +1030,73 @@ def test_manifest_cache_can_be_disabled_with_zero_size(monkeypatch: pytest.Monke
10611030
finally:
10621031
monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False)
10631032
importlib.reload(manifest_module)
1033+
1034+
1035+
def test_manifest_cache_respects_positive_env_size(monkeypatch: pytest.MonkeyPatch) -> None:
1036+
"""Test that a positive manifest-cache-size enables a bounded cache."""
1037+
monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", "1")
1038+
importlib.reload(manifest_module)
1039+
1040+
try:
1041+
assert manifest_module._manifest_cache_size == 1
1042+
1043+
io = PyArrowFileIO()
1044+
1045+
with TemporaryDirectory() as tmp_dir:
1046+
first_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="first", snapshot_id=1)
1047+
second_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="second", snapshot_id=2)
1048+
1049+
manifests_first_call = manifest_module._manifests(io, first_list_path)
1050+
manifests_second_call = manifest_module._manifests(io, first_list_path)
1051+
1052+
assert manifests_first_call[0] is manifests_second_call[0]
1053+
assert len(manifest_module._manifest_cache) == 1
1054+
1055+
manifest_module._manifests(io, second_list_path)
1056+
1057+
assert len(manifest_module._manifest_cache) == 1
1058+
finally:
1059+
monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False)
1060+
importlib.reload(manifest_module)
1061+
1062+
1063+
def test_manifest_cache_reads_size_from_configuration_file(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
1064+
"""Test that manifest-cache-size can be loaded from .pyiceberg.yaml."""
1065+
config_dir = tmp_path / "config"
1066+
config_dir.mkdir()
1067+
(config_dir / ".pyiceberg.yaml").write_text("manifest-cache-size: 2\n", encoding="utf-8")
1068+
1069+
monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False)
1070+
monkeypatch.setenv("PYICEBERG_HOME", str(config_dir))
1071+
importlib.reload(manifest_module)
1072+
1073+
try:
1074+
assert manifest_module._manifest_cache_size == 2
1075+
1076+
io = PyArrowFileIO()
1077+
1078+
with TemporaryDirectory() as tmp_dir:
1079+
first_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="first", snapshot_id=1)
1080+
second_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="second", snapshot_id=2)
1081+
third_list_path = _create_test_manifest_list(manifest_module, io, tmp_dir, name="third", snapshot_id=3)
1082+
1083+
manifest_module._manifests(io, first_list_path)
1084+
manifest_module._manifests(io, second_list_path)
1085+
manifest_module._manifests(io, third_list_path)
1086+
1087+
assert len(manifest_module._manifest_cache) == 2
1088+
finally:
1089+
monkeypatch.delenv("PYICEBERG_HOME", raising=False)
1090+
importlib.reload(manifest_module)
1091+
1092+
1093+
def test_invalid_manifest_cache_size_raises_value_error(monkeypatch: pytest.MonkeyPatch) -> None:
1094+
"""Test that invalid manifest-cache-size values raise a helpful error."""
1095+
monkeypatch.setenv("PYICEBERG_MANIFEST_CACHE_SIZE", "not-an-int")
1096+
1097+
try:
1098+
with pytest.raises(ValueError, match="manifest-cache-size should be an integer or left unset"):
1099+
importlib.reload(manifest_module)
1100+
finally:
1101+
monkeypatch.delenv("PYICEBERG_MANIFEST_CACHE_SIZE", raising=False)
1102+
importlib.reload(manifest_module)

0 commit comments

Comments
 (0)