|
28 | 28 | Literal, |
29 | 29 | ) |
30 | 30 |
|
31 | | -from cachetools import LRUCache, cached |
32 | | -from cachetools.keys import hashkey |
| 31 | +from cachetools import LRUCache |
33 | 32 | from pydantic_core import to_json |
34 | 33 |
|
35 | 34 | from pyiceberg.avro.codecs import AVRO_CODEC_KEY, AvroCompressionCodec |
@@ -892,15 +891,49 @@ def __hash__(self) -> int: |
892 | 891 | return hash(self.manifest_path) |
893 | 892 |
|
894 | 893 |
|
895 | | -# Global cache for manifest lists |
896 | | -_manifest_cache: LRUCache[Any, tuple[ManifestFile, ...]] = LRUCache(maxsize=128) |
| 894 | +# Global cache for individual ManifestFile objects, keyed by manifest_path. |
| 895 | +# This avoids duplicating ManifestFile objects when multiple manifest lists |
| 896 | +# share the same manifests (which is common after appends). |
| 897 | +_manifest_cache: LRUCache[str, ManifestFile] = LRUCache(maxsize=512) |
| 898 | + |
| 899 | +# Lock for thread-safe cache access |
| 900 | +_manifest_cache_lock = threading.RLock() |
897 | 901 |
|
898 | 902 |
|
899 | | -@cached(cache=_manifest_cache, key=lambda io, manifest_list: hashkey(manifest_list), lock=threading.RLock()) |
900 | 903 | def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]: |
901 | | - """Read and cache manifests from the given manifest list, returning a tuple to prevent modification.""" |
| 904 | + """Read manifests from the given manifest list, caching individual ManifestFile objects. |
| 905 | +
|
| 906 | + Unlike caching entire manifest lists, this approach caches individual ManifestFile |
| 907 | + objects by their manifest_path. This is more memory-efficient because: |
| 908 | + - ManifestList1 contains: (ManifestFile1) |
| 909 | + - ManifestList2 contains: (ManifestFile1, ManifestFile2) |
| 910 | + - ManifestList3 contains: (ManifestFile1, ManifestFile2, ManifestFile3) |
| 911 | +
|
| 912 | + With per-ManifestFile caching, ManifestFile1 is stored only once and reused, |
| 913 | + instead of being duplicated in each manifest list's cached tuple. |
| 914 | +
|
| 915 | + Args: |
| 916 | + io: The FileIO to read the manifest list. |
| 917 | + manifest_list: The path to the manifest list file. |
| 918 | +
|
| 919 | + Returns: |
| 920 | + A tuple of ManifestFile objects (tuple to prevent modification). |
| 921 | + """ |
902 | 922 | file = io.new_input(manifest_list) |
903 | | - return tuple(read_manifest_list(file)) |
| 923 | + result = [] |
| 924 | + |
| 925 | + for manifest_file in read_manifest_list(file): |
| 926 | + manifest_path = manifest_file.manifest_path |
| 927 | + with _manifest_cache_lock: |
| 928 | + if manifest_path in _manifest_cache: |
| 929 | + # Reuse the cached ManifestFile object |
| 930 | + result.append(_manifest_cache[manifest_path]) |
| 931 | + else: |
| 932 | + # Cache and use this ManifestFile |
| 933 | + _manifest_cache[manifest_path] = manifest_file |
| 934 | + result.append(manifest_file) |
| 935 | + |
| 936 | + return tuple(result) |
904 | 937 |
|
905 | 938 |
|
906 | 939 | def read_manifest_list(input_file: InputFile) -> Iterator[ManifestFile]: |
|
0 commit comments