Skip to content

Commit b6d6c81

Browse files
committed
Add unit tests for manifest cache configs
1 parent f22614b commit b6d6c81

File tree

1 file changed

+129
-9
lines changed

1 file changed

+129
-9
lines changed

tests/utils/test_manifest.py

Lines changed: 129 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,12 @@
1616
# under the License.
1717
# pylint: disable=redefined-outer-name,arguments-renamed,fixme
1818
from tempfile import TemporaryDirectory
19+
from unittest import mock
1920

2021
import fastavro
2122
import pytest
2223

24+
import pyiceberg.manifest as manifest_module
2325
from pyiceberg.avro.codecs import AvroCompressionCodec
2426
from pyiceberg.io import load_file_io
2527
from pyiceberg.io.pyarrow import PyArrowFileIO
@@ -32,8 +34,9 @@
3234
ManifestEntryStatus,
3335
ManifestFile,
3436
PartitionFieldSummary,
35-
_manifest_cache,
37+
_get_manifest_cache,
3638
_manifests,
39+
clear_manifest_cache,
3740
read_manifest_list,
3841
write_manifest,
3942
write_manifest_list,
@@ -46,9 +49,10 @@
4649

4750

4851
@pytest.fixture(autouse=True)
49-
def clear_global_manifests_cache() -> None:
50-
# Clear the global cache before each test
51-
_manifest_cache.clear()
52+
def reset_global_manifests_cache() -> None:
53+
# Reset cache state before each test so config is re-read
54+
manifest_module._manifest_cache_manager._cache = None
55+
manifest_module._manifest_cache_manager._initialized = False
5256

5357

5458
def _verify_metadata_with_fastavro(avro_file: str, expected_metadata: dict[str, str]) -> None:
@@ -804,9 +808,9 @@ def test_manifest_cache_deduplicates_manifest_files() -> None:
804808

805809
# Verify cache size - should only have 3 unique ManifestFile objects
806810
# instead of 1 + 2 + 3 = 6 objects as with the old approach
807-
assert len(_manifest_cache) == 3, (
808-
f"Cache should contain exactly 3 unique ManifestFile objects, but has {len(_manifest_cache)}"
809-
)
811+
cache = _get_manifest_cache()
812+
assert cache is not None, "Manifest cache should be enabled for this test"
813+
assert len(cache) == 3, f"Cache should contain exactly 3 unique ManifestFile objects, but has {len(cache)}"
810814

811815

812816
def test_manifest_cache_efficiency_with_many_overlapping_lists() -> None:
@@ -879,9 +883,11 @@ def test_manifest_cache_efficiency_with_many_overlapping_lists() -> None:
879883
# With the new approach, we should have exactly N objects
880884

881885
# Verify cache has exactly N unique entries
882-
assert len(_manifest_cache) == num_manifests, (
886+
cache = _get_manifest_cache()
887+
assert cache is not None, "Manifest cache should be enabled for this test"
888+
assert len(cache) == num_manifests, (
883889
f"Cache should contain exactly {num_manifests} ManifestFile objects, "
884-
f"but has {len(_manifest_cache)}. "
890+
f"but has {len(cache)}. "
885891
f"Old approach would have {num_manifests * (num_manifests + 1) // 2} objects."
886892
)
887893

@@ -897,3 +903,117 @@ def test_manifest_cache_efficiency_with_many_overlapping_lists() -> None:
897903
if len(references) > 1:
898904
for ref in references[1:]:
899905
assert ref is references[0], f"All references to manifest {i} should be the same object instance"
906+
907+
908+
def test_clear_manifest_cache() -> None:
909+
"""Test that clear_manifest_cache() clears cache entries while keeping cache enabled."""
910+
io = PyArrowFileIO()
911+
912+
with TemporaryDirectory() as tmp_dir:
913+
schema = Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=True))
914+
spec = UNPARTITIONED_PARTITION_SPEC
915+
916+
# Create a manifest file
917+
manifest_path = f"{tmp_dir}/manifest.avro"
918+
with write_manifest(
919+
format_version=2,
920+
spec=spec,
921+
schema=schema,
922+
output_file=io.new_output(manifest_path),
923+
snapshot_id=1,
924+
avro_compression="zstandard",
925+
) as writer:
926+
data_file = DataFile.from_args(
927+
content=DataFileContent.DATA,
928+
file_path=f"{tmp_dir}/data.parquet",
929+
file_format=FileFormat.PARQUET,
930+
partition=Record(),
931+
record_count=100,
932+
file_size_in_bytes=1000,
933+
)
934+
writer.add_entry(
935+
ManifestEntry.from_args(
936+
status=ManifestEntryStatus.ADDED,
937+
snapshot_id=1,
938+
data_file=data_file,
939+
)
940+
)
941+
manifest_file = writer.to_manifest_file()
942+
943+
# Create a manifest list
944+
list_path = f"{tmp_dir}/manifest-list.avro"
945+
with write_manifest_list(
946+
format_version=2,
947+
output_file=io.new_output(list_path),
948+
snapshot_id=1,
949+
parent_snapshot_id=None,
950+
sequence_number=1,
951+
avro_compression="zstandard",
952+
) as list_writer:
953+
list_writer.add_manifests([manifest_file])
954+
955+
# Populate the cache
956+
_manifests(io, list_path)
957+
958+
# Verify cache has entries
959+
cache = _get_manifest_cache()
960+
assert cache is not None, "Cache should be enabled"
961+
assert len(cache) > 0, "Cache should have entries after reading manifests"
962+
963+
# Clear the cache
964+
clear_manifest_cache()
965+
966+
# Verify cache is empty but still enabled
967+
cache_after = _get_manifest_cache()
968+
assert cache_after is not None, "Cache should still be enabled after clear"
969+
assert len(cache_after) == 0, "Cache should be empty after clear"
970+
971+
972+
@pytest.mark.parametrize(
973+
"env_vars,expected_enabled,expected_size",
974+
[
975+
({}, True, 128), # defaults
976+
({"PYICEBERG_MANIFEST__CACHE__ENABLED": "true"}, True, 128),
977+
({"PYICEBERG_MANIFEST__CACHE__ENABLED": "false"}, False, 128),
978+
({"PYICEBERG_MANIFEST__CACHE__SIZE": "64"}, True, 64),
979+
({"PYICEBERG_MANIFEST__CACHE__SIZE": "256"}, True, 256),
980+
({"PYICEBERG_MANIFEST__CACHE__ENABLED": "false", "PYICEBERG_MANIFEST__CACHE__SIZE": "64"}, False, 64),
981+
],
982+
)
983+
def test_manifest_cache_config_valid_values(env_vars: dict[str, str], expected_enabled: bool, expected_size: int) -> None:
984+
"""Test that valid config values are applied correctly."""
985+
import os
986+
987+
with mock.patch.dict(os.environ, env_vars, clear=False):
988+
# Reset cache state so config is re-read
989+
manifest_module._manifest_cache_manager._cache = None
990+
manifest_module._manifest_cache_manager._initialized = False
991+
cache = _get_manifest_cache()
992+
993+
if expected_enabled:
994+
assert cache is not None, "Cache should be enabled"
995+
assert cache.maxsize == expected_size, f"Cache size should be {expected_size}"
996+
else:
997+
assert cache is None, "Cache should be disabled"
998+
999+
1000+
@pytest.mark.parametrize(
1001+
"env_vars,expected_error_substring",
1002+
[
1003+
({"PYICEBERG_MANIFEST__CACHE__ENABLED": "maybe"}, "manifest.cache.enabled should be a boolean"),
1004+
({"PYICEBERG_MANIFEST__CACHE__ENABLED": "invalid"}, "manifest.cache.enabled should be a boolean"),
1005+
({"PYICEBERG_MANIFEST__CACHE__SIZE": "abc"}, "manifest.cache.size should be a positive integer"),
1006+
({"PYICEBERG_MANIFEST__CACHE__SIZE": "0"}, "manifest.cache.size must be >= 1"),
1007+
({"PYICEBERG_MANIFEST__CACHE__SIZE": "-5"}, "manifest.cache.size must be >= 1"),
1008+
],
1009+
)
1010+
def test_manifest_cache_config_invalid_values(env_vars: dict[str, str], expected_error_substring: str) -> None:
1011+
"""Test that invalid config values raise ValueError with appropriate message."""
1012+
import os
1013+
1014+
with mock.patch.dict(os.environ, env_vars, clear=False):
1015+
# Reset cache state so config is re-read
1016+
manifest_module._manifest_cache_manager._cache = None
1017+
manifest_module._manifest_cache_manager._initialized = False
1018+
with pytest.raises(ValueError, match=expected_error_substring):
1019+
_get_manifest_cache()

0 commit comments

Comments
 (0)