1616# under the License.
1717# pylint: disable=redefined-outer-name,arguments-renamed,fixme
1818from tempfile import TemporaryDirectory
19+ from unittest import mock
1920
2021import fastavro
2122import pytest
2223
24+ import pyiceberg .manifest as manifest_module
2325from pyiceberg .avro .codecs import AvroCompressionCodec
2426from pyiceberg .io import load_file_io
2527from pyiceberg .io .pyarrow import PyArrowFileIO
3234 ManifestEntryStatus ,
3335 ManifestFile ,
3436 PartitionFieldSummary ,
35- _manifest_cache ,
37+ _get_manifest_cache ,
3638 _manifests ,
39+ clear_manifest_cache ,
3740 read_manifest_list ,
3841 write_manifest ,
3942 write_manifest_list ,
4649
4750
@pytest.fixture(autouse=True)
def reset_global_manifests_cache() -> None:
    """Reset the global manifest-cache manager before every test.

    Dropping both the cached LRU object and the initialized flag forces the
    manager to re-read its configuration the next time the cache is accessed,
    so each test observes a freshly configured cache.
    """
    mgr = manifest_module._manifest_cache_manager
    mgr._cache = None
    mgr._initialized = False
5256
5357
5458def _verify_metadata_with_fastavro (avro_file : str , expected_metadata : dict [str , str ]) -> None :
@@ -804,9 +808,9 @@ def test_manifest_cache_deduplicates_manifest_files() -> None:
804808
805809 # Verify cache size - should only have 3 unique ManifestFile objects
806810 # instead of 1 + 2 + 3 = 6 objects as with the old approach
807- assert len ( _manifest_cache ) == 3 , (
808- f"Cache should contain exactly 3 unique ManifestFile objects, but has { len ( _manifest_cache ) } "
809- )
811+ cache = _get_manifest_cache ()
812+ assert cache is not None , "Manifest cache should be enabled for this test "
813+ assert len ( cache ) == 3 , f"Cache should contain exactly 3 unique ManifestFile objects, but has { len ( cache ) } "
810814
811815
812816def test_manifest_cache_efficiency_with_many_overlapping_lists () -> None :
@@ -879,9 +883,11 @@ def test_manifest_cache_efficiency_with_many_overlapping_lists() -> None:
879883 # With the new approach, we should have exactly N objects
880884
881885 # Verify cache has exactly N unique entries
882- assert len (_manifest_cache ) == num_manifests , (
886+ cache = _get_manifest_cache ()
887+ assert cache is not None , "Manifest cache should be enabled for this test"
888+ assert len (cache ) == num_manifests , (
883889 f"Cache should contain exactly { num_manifests } ManifestFile objects, "
884- f"but has { len (_manifest_cache )} . "
890+ f"but has { len (cache )} . "
885891 f"Old approach would have { num_manifests * (num_manifests + 1 ) // 2 } objects."
886892 )
887893
@@ -897,3 +903,117 @@ def test_manifest_cache_efficiency_with_many_overlapping_lists() -> None:
897903 if len (references ) > 1 :
898904 for ref in references [1 :]:
899905 assert ref is references [0 ], f"All references to manifest { i } should be the same object instance"
906+
907+
def test_clear_manifest_cache() -> None:
    """Test that clear_manifest_cache() clears cache entries while keeping cache enabled."""
    io = PyArrowFileIO()

    with TemporaryDirectory() as tmp_dir:
        schema = Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=True))
        spec = UNPARTITIONED_PARTITION_SPEC

        # Write a manifest file containing a single ADDED data-file entry.
        manifest_path = f"{tmp_dir}/manifest.avro"
        with write_manifest(
            format_version=2,
            spec=spec,
            schema=schema,
            output_file=io.new_output(manifest_path),
            snapshot_id=1,
            avro_compression="zstandard",
        ) as writer:
            entry = ManifestEntry.from_args(
                status=ManifestEntryStatus.ADDED,
                snapshot_id=1,
                data_file=DataFile.from_args(
                    content=DataFileContent.DATA,
                    file_path=f"{tmp_dir}/data.parquet",
                    file_format=FileFormat.PARQUET,
                    partition=Record(),
                    record_count=100,
                    file_size_in_bytes=1000,
                ),
            )
            writer.add_entry(entry)
        # NOTE(review): called after the `with` exits so the underlying Avro
        # file is closed before the ManifestFile is materialized — confirm
        # against the other write_manifest usages in this module.
        manifest_file = writer.to_manifest_file()

        # Wrap the manifest in a manifest list so _manifests() can read it.
        list_path = f"{tmp_dir}/manifest-list.avro"
        with write_manifest_list(
            format_version=2,
            output_file=io.new_output(list_path),
            snapshot_id=1,
            parent_snapshot_id=None,
            sequence_number=1,
            avro_compression="zstandard",
        ) as list_writer:
            list_writer.add_manifests([manifest_file])

        # Reading the manifest list populates the global cache.
        _manifests(io, list_path)

        cache = _get_manifest_cache()
        assert cache is not None, "Cache should be enabled"
        assert len(cache) > 0, "Cache should have entries after reading manifests"

        clear_manifest_cache()

        # Clearing drops the entries but must not disable the cache itself.
        cache_after = _get_manifest_cache()
        assert cache_after is not None, "Cache should still be enabled after clear"
        assert len(cache_after) == 0, "Cache should be empty after clear"
970+
971+
@pytest.mark.parametrize(
    "env_vars,expected_enabled,expected_size",
    [
        ({}, True, 128),  # defaults
        ({"PYICEBERG_MANIFEST__CACHE__ENABLED": "true"}, True, 128),
        ({"PYICEBERG_MANIFEST__CACHE__ENABLED": "false"}, False, 128),
        ({"PYICEBERG_MANIFEST__CACHE__SIZE": "64"}, True, 64),
        ({"PYICEBERG_MANIFEST__CACHE__SIZE": "256"}, True, 256),
        ({"PYICEBERG_MANIFEST__CACHE__ENABLED": "false", "PYICEBERG_MANIFEST__CACHE__SIZE": "64"}, False, 64),
    ],
)
def test_manifest_cache_config_valid_values(env_vars: dict[str, str], expected_enabled: bool, expected_size: int) -> None:
    """Test that valid config values are applied correctly.

    Each case patches the environment, forces the cache manager to re-read its
    configuration, and checks whether the resulting cache is enabled and sized
    as expected.
    """
    # Patch by dotted name ("os.environ") so no function-local `import os`
    # is needed; unittest.mock resolves the string target itself.
    with mock.patch.dict("os.environ", env_vars, clear=False):
        # Reset cache state so config is re-read under the patched environment.
        manifest_module._manifest_cache_manager._cache = None
        manifest_module._manifest_cache_manager._initialized = False
        cache = _get_manifest_cache()

        if expected_enabled:
            assert cache is not None, "Cache should be enabled"
            assert cache.maxsize == expected_size, f"Cache size should be {expected_size}"
        else:
            assert cache is None, "Cache should be disabled"
998+
999+
@pytest.mark.parametrize(
    "env_vars,expected_error_substring",
    [
        ({"PYICEBERG_MANIFEST__CACHE__ENABLED": "maybe"}, "manifest.cache.enabled should be a boolean"),
        ({"PYICEBERG_MANIFEST__CACHE__ENABLED": "invalid"}, "manifest.cache.enabled should be a boolean"),
        ({"PYICEBERG_MANIFEST__CACHE__SIZE": "abc"}, "manifest.cache.size should be a positive integer"),
        ({"PYICEBERG_MANIFEST__CACHE__SIZE": "0"}, "manifest.cache.size must be >= 1"),
        ({"PYICEBERG_MANIFEST__CACHE__SIZE": "-5"}, "manifest.cache.size must be >= 1"),
    ],
)
def test_manifest_cache_config_invalid_values(env_vars: dict[str, str], expected_error_substring: str) -> None:
    """Test that invalid config values raise ValueError with appropriate message.

    The error must surface on the first cache access after the bad
    configuration is set, so the manager state is reset inside the patch.
    """
    # Patch by dotted name ("os.environ") so no function-local `import os`
    # is needed; unittest.mock resolves the string target itself.
    with mock.patch.dict("os.environ", env_vars, clear=False):
        # Reset cache state so config is re-read under the patched environment.
        manifest_module._manifest_cache_manager._cache = None
        manifest_module._manifest_cache_manager._initialized = False
        with pytest.raises(ValueError, match=expected_error_substring):
            _get_manifest_cache()
0 commit comments