Skip to content

Commit cab3823

Browse files
committed
cache manifest, not tuple
1 parent a26fca7 commit cab3823

File tree

2 files changed

+333
-25
lines changed

2 files changed

+333
-25
lines changed

pyiceberg/manifest.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@
2828
Literal,
2929
)
3030

31-
from cachetools import LRUCache, cached
32-
from cachetools.keys import hashkey
31+
from cachetools import LRUCache
3332
from pydantic_core import to_json
3433

3534
from pyiceberg.avro.codecs import AVRO_CODEC_KEY, AvroCompressionCodec
@@ -892,15 +891,49 @@ def __hash__(self) -> int:
892891
return hash(self.manifest_path)
893892

894893

895-
# Global cache for manifest lists
896-
_manifest_cache: LRUCache[Any, tuple[ManifestFile, ...]] = LRUCache(maxsize=128)
894+
# Global cache for individual ManifestFile objects, keyed by manifest_path.
895+
# This avoids duplicating ManifestFile objects when multiple manifest lists
896+
# share the same manifests (which is common after appends).
897+
_manifest_cache: LRUCache[str, ManifestFile] = LRUCache(maxsize=512)
898+
899+
# Lock for thread-safe cache access
900+
_manifest_cache_lock = threading.RLock()
897901

898902

899-
@cached(cache=_manifest_cache, key=lambda io, manifest_list: hashkey(manifest_list), lock=threading.RLock())
900903
def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]:
901-
"""Read and cache manifests from the given manifest list, returning a tuple to prevent modification."""
904+
"""Read manifests from the given manifest list, caching individual ManifestFile objects.
905+
906+
Unlike caching entire manifest lists, this approach caches individual ManifestFile
907+
objects by their manifest_path. This is more memory-efficient because:
908+
- ManifestList1 contains: (ManifestFile1)
909+
- ManifestList2 contains: (ManifestFile1, ManifestFile2)
910+
- ManifestList3 contains: (ManifestFile1, ManifestFile2, ManifestFile3)
911+
912+
With per-ManifestFile caching, ManifestFile1 is stored only once and reused,
913+
instead of being duplicated in each manifest list's cached tuple.
914+
915+
Args:
916+
io: The FileIO to read the manifest list.
917+
manifest_list: The path to the manifest list file.
918+
919+
Returns:
920+
A tuple of ManifestFile objects (tuple to prevent modification).
921+
"""
902922
file = io.new_input(manifest_list)
903-
return tuple(read_manifest_list(file))
923+
result = []
924+
925+
for manifest_file in read_manifest_list(file):
926+
manifest_path = manifest_file.manifest_path
927+
with _manifest_cache_lock:
928+
if manifest_path in _manifest_cache:
929+
# Reuse the cached ManifestFile object
930+
result.append(_manifest_cache[manifest_path])
931+
else:
932+
# Cache and use this ManifestFile
933+
_manifest_cache[manifest_path] = manifest_file
934+
result.append(manifest_file)
935+
936+
return tuple(result)
904937

905938

906939
def read_manifest_list(input_file: InputFile) -> Iterator[ManifestFile]:

0 commit comments

Comments
 (0)