1616# under the License.
1717# pylint: disable=redefined-outer-name,arguments-renamed,fixme
1818import importlib
19+ from pathlib import Path
1920from tempfile import TemporaryDirectory
21+ from typing import Any
2022
2123import fastavro
2224import pytest
@@ -937,52 +939,56 @@ def test_manifest_writer_tell(format_version: TableVersion) -> None:
937939 assert after_entry_bytes > initial_bytes , "Bytes should increase after adding entry"
938940
939941
942+ def _create_test_manifest_list (module : Any , io : PyArrowFileIO , tmp_dir : str , name : str , snapshot_id : int ) -> str :
943+ schema = Schema (NestedField (field_id = 1 , name = "id" , field_type = IntegerType (), required = True ))
944+ spec = UNPARTITIONED_PARTITION_SPEC
945+
946+ manifest_path = f"{ tmp_dir } /manifest-{ name } .avro"
947+ with module .write_manifest (
948+ format_version = 2 ,
949+ spec = spec ,
950+ schema = schema ,
951+ output_file = io .new_output (manifest_path ),
952+ snapshot_id = snapshot_id ,
953+ avro_compression = "zstandard" ,
954+ ) as writer :
955+ data_file = module .DataFile .from_args (
956+ content = module .DataFileContent .DATA ,
957+ file_path = f"{ tmp_dir } /data-{ name } .parquet" ,
958+ file_format = module .FileFormat .PARQUET ,
959+ partition = Record (),
960+ record_count = 100 ,
961+ file_size_in_bytes = 1000 ,
962+ )
963+ writer .add_entry (
964+ module .ManifestEntry .from_args (
965+ status = module .ManifestEntryStatus .ADDED ,
966+ snapshot_id = snapshot_id ,
967+ data_file = data_file ,
968+ )
969+ )
970+ manifest_file = writer .to_manifest_file ()
971+
972+ list_path = f"{ tmp_dir } /manifest-list-{ name } .avro"
973+ with module .write_manifest_list (
974+ format_version = 2 ,
975+ output_file = io .new_output (list_path ),
976+ snapshot_id = snapshot_id ,
977+ parent_snapshot_id = snapshot_id - 1 if snapshot_id > 1 else None ,
978+ sequence_number = snapshot_id ,
979+ avro_compression = "zstandard" ,
980+ ) as list_writer :
981+ list_writer .add_manifests ([manifest_file ])
982+
983+ return list_path
984+
985+
940986def test_clear_manifest_cache () -> None :
941987 """Test that clear_manifest_cache() clears cache entries while keeping cache enabled."""
942988 io = PyArrowFileIO ()
943989
944990 with TemporaryDirectory () as tmp_dir :
945- schema = Schema (NestedField (field_id = 1 , name = "id" , field_type = IntegerType (), required = True ))
946- spec = UNPARTITIONED_PARTITION_SPEC
947-
948- # Create a manifest file
949- manifest_path = f"{ tmp_dir } /manifest.avro"
950- with write_manifest (
951- format_version = 2 ,
952- spec = spec ,
953- schema = schema ,
954- output_file = io .new_output (manifest_path ),
955- snapshot_id = 1 ,
956- avro_compression = "zstandard" ,
957- ) as writer :
958- data_file = DataFile .from_args (
959- content = DataFileContent .DATA ,
960- file_path = f"{ tmp_dir } /data.parquet" ,
961- file_format = FileFormat .PARQUET ,
962- partition = Record (),
963- record_count = 100 ,
964- file_size_in_bytes = 1000 ,
965- )
966- writer .add_entry (
967- ManifestEntry .from_args (
968- status = ManifestEntryStatus .ADDED ,
969- snapshot_id = 1 ,
970- data_file = data_file ,
971- )
972- )
973- manifest_file = writer .to_manifest_file ()
974-
975- # Create a manifest list
976- list_path = f"{ tmp_dir } /manifest-list.avro"
977- with write_manifest_list (
978- format_version = 2 ,
979- output_file = io .new_output (list_path ),
980- snapshot_id = 1 ,
981- parent_snapshot_id = None ,
982- sequence_number = 1 ,
983- avro_compression = "zstandard" ,
984- ) as list_writer :
985- list_writer .add_manifests ([manifest_file ])
991+ list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "clear" , snapshot_id = 1 )
986992
987993 # Populate the cache
988994 _manifests (io , list_path )
@@ -1001,9 +1007,10 @@ def test_clear_manifest_cache() -> None:
10011007 assert len (cache_after ) == 0 , "Cache should be empty after clear"
10021008
10031009
1004- def test_manifest_cache_can_be_disabled_with_zero_size (monkeypatch : pytest .MonkeyPatch ) -> None :
1005- """Test that setting manifest-cache-size to 0 disables caching."""
1006- monkeypatch .setenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , "0" )
1010+ @pytest .mark .parametrize ("cache_size" , ["0" , "-1" ])
1011+ def test_manifest_cache_can_be_disabled_with_non_positive_size (monkeypatch : pytest .MonkeyPatch , cache_size : str ) -> None :
1012+ """Test that non-positive manifest-cache-size values disable caching."""
1013+ monkeypatch .setenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , cache_size )
10071014 importlib .reload (manifest_module )
10081015
10091016 try :
@@ -1013,45 +1020,7 @@ def test_manifest_cache_can_be_disabled_with_zero_size(monkeypatch: pytest.Monke
10131020 io = PyArrowFileIO ()
10141021
10151022 with TemporaryDirectory () as tmp_dir :
1016- schema = Schema (NestedField (field_id = 1 , name = "id" , field_type = IntegerType (), required = True ))
1017- spec = UNPARTITIONED_PARTITION_SPEC
1018-
1019- manifest_path = f"{ tmp_dir } /manifest.avro"
1020- with manifest_module .write_manifest (
1021- format_version = 2 ,
1022- spec = spec ,
1023- schema = schema ,
1024- output_file = io .new_output (manifest_path ),
1025- snapshot_id = 1 ,
1026- avro_compression = "zstandard" ,
1027- ) as writer :
1028- data_file = manifest_module .DataFile .from_args (
1029- content = manifest_module .DataFileContent .DATA ,
1030- file_path = f"{ tmp_dir } /data.parquet" ,
1031- file_format = manifest_module .FileFormat .PARQUET ,
1032- partition = Record (),
1033- record_count = 100 ,
1034- file_size_in_bytes = 1000 ,
1035- )
1036- writer .add_entry (
1037- manifest_module .ManifestEntry .from_args (
1038- status = manifest_module .ManifestEntryStatus .ADDED ,
1039- snapshot_id = 1 ,
1040- data_file = data_file ,
1041- )
1042- )
1043- manifest_file = writer .to_manifest_file ()
1044-
1045- list_path = f"{ tmp_dir } /manifest-list.avro"
1046- with manifest_module .write_manifest_list (
1047- format_version = 2 ,
1048- output_file = io .new_output (list_path ),
1049- snapshot_id = 1 ,
1050- parent_snapshot_id = None ,
1051- sequence_number = 1 ,
1052- avro_compression = "zstandard" ,
1053- ) as list_writer :
1054- list_writer .add_manifests ([manifest_file ])
1023+ list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "disabled" , snapshot_id = 1 )
10551024
10561025 manifests_first_call = manifest_module ._manifests (io , list_path )
10571026 manifests_second_call = manifest_module ._manifests (io , list_path )
@@ -1061,3 +1030,73 @@ def test_manifest_cache_can_be_disabled_with_zero_size(monkeypatch: pytest.Monke
10611030 finally :
10621031 monkeypatch .delenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , raising = False )
10631032 importlib .reload (manifest_module )
1033+
1034+
1035+ def test_manifest_cache_respects_positive_env_size (monkeypatch : pytest .MonkeyPatch ) -> None :
1036+ """Test that a positive manifest-cache-size enables a bounded cache."""
1037+ monkeypatch .setenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , "1" )
1038+ importlib .reload (manifest_module )
1039+
1040+ try :
1041+ assert manifest_module ._manifest_cache_size == 1
1042+
1043+ io = PyArrowFileIO ()
1044+
1045+ with TemporaryDirectory () as tmp_dir :
1046+ first_list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "first" , snapshot_id = 1 )
1047+ second_list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "second" , snapshot_id = 2 )
1048+
1049+ manifests_first_call = manifest_module ._manifests (io , first_list_path )
1050+ manifests_second_call = manifest_module ._manifests (io , first_list_path )
1051+
1052+ assert manifests_first_call [0 ] is manifests_second_call [0 ]
1053+ assert len (manifest_module ._manifest_cache ) == 1
1054+
1055+ manifest_module ._manifests (io , second_list_path )
1056+
1057+ assert len (manifest_module ._manifest_cache ) == 1
1058+ finally :
1059+ monkeypatch .delenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , raising = False )
1060+ importlib .reload (manifest_module )
1061+
1062+
1063+ def test_manifest_cache_reads_size_from_configuration_file (monkeypatch : pytest .MonkeyPatch , tmp_path : Path ) -> None :
1064+ """Test that manifest-cache-size can be loaded from .pyiceberg.yaml."""
1065+ config_dir = tmp_path / "config"
1066+ config_dir .mkdir ()
1067+ (config_dir / ".pyiceberg.yaml" ).write_text ("manifest-cache-size: 2\n " , encoding = "utf-8" )
1068+
1069+ monkeypatch .delenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , raising = False )
1070+ monkeypatch .setenv ("PYICEBERG_HOME" , str (config_dir ))
1071+ importlib .reload (manifest_module )
1072+
1073+ try :
1074+ assert manifest_module ._manifest_cache_size == 2
1075+
1076+ io = PyArrowFileIO ()
1077+
1078+ with TemporaryDirectory () as tmp_dir :
1079+ first_list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "first" , snapshot_id = 1 )
1080+ second_list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "second" , snapshot_id = 2 )
1081+ third_list_path = _create_test_manifest_list (manifest_module , io , tmp_dir , name = "third" , snapshot_id = 3 )
1082+
1083+ manifest_module ._manifests (io , first_list_path )
1084+ manifest_module ._manifests (io , second_list_path )
1085+ manifest_module ._manifests (io , third_list_path )
1086+
1087+ assert len (manifest_module ._manifest_cache ) == 2
1088+ finally :
1089+ monkeypatch .delenv ("PYICEBERG_HOME" , raising = False )
1090+ importlib .reload (manifest_module )
1091+
1092+
1093+ def test_invalid_manifest_cache_size_raises_value_error (monkeypatch : pytest .MonkeyPatch ) -> None :
1094+ """Test that invalid manifest-cache-size values raise a helpful error."""
1095+ monkeypatch .setenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , "not-an-int" )
1096+
1097+ try :
1098+ with pytest .raises (ValueError , match = "manifest-cache-size should be an integer or left unset" ):
1099+ importlib .reload (manifest_module )
1100+ finally :
1101+ monkeypatch .delenv ("PYICEBERG_MANIFEST_CACHE_SIZE" , raising = False )
1102+ importlib .reload (manifest_module )
0 commit comments