Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 40 additions & 7 deletions pyiceberg/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@
Literal,
)

from cachetools import LRUCache, cached
from cachetools.keys import hashkey
from cachetools import LRUCache
from pydantic_core import to_json

from pyiceberg.avro.codecs import AVRO_CODEC_KEY, AvroCompressionCodec
Expand Down Expand Up @@ -892,15 +891,49 @@ def __hash__(self) -> int:
return hash(self.manifest_path)


# Global cache for manifest lists
_manifest_cache: LRUCache[Any, tuple[ManifestFile, ...]] = LRUCache(maxsize=128)
# Global cache for individual ManifestFile objects, keyed by manifest_path.
# This avoids duplicating ManifestFile objects when multiple manifest lists
# share the same manifests (which is common after appends).
_manifest_cache: LRUCache[str, ManifestFile] = LRUCache(maxsize=512)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why bump this up from 128 -> 512 (it's okay to say it's arbitrary)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch. now that we're only caching ManifestFile objects, they have relatively small memory footprint. we were catching manifest list before, each pointing to many many ManifestFiles

also #2952 should make this configurable

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The target size of a Manifest is 8MB. 512*8MB=4GB, which seems high. Should we keep this at 128 (1GB) until we make this configurable?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sounds reasonable! lets do it


# Lock for thread-safe cache access
_manifest_cache_lock = threading.RLock()


@cached(cache=_manifest_cache, key=lambda io, manifest_list: hashkey(manifest_list), lock=threading.RLock())
def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]:
"""Read and cache manifests from the given manifest list, returning a tuple to prevent modification."""
"""Read manifests from the given manifest list, caching individual ManifestFile objects.

Unlike caching entire manifest lists, this approach caches individual ManifestFile
objects by their manifest_path. This is more memory-efficient because:
- ManifestList1 contains: (ManifestFile1)
- ManifestList2 contains: (ManifestFile1, ManifestFile2)
- ManifestList3 contains: (ManifestFile1, ManifestFile2, ManifestFile3)

With per-ManifestFile caching, ManifestFile1 is stored only once and reused,
instead of being duplicated in each manifest list's cached tuple.

Args:
io: The FileIO to read the manifest list.
manifest_list: The path to the manifest list file.

Returns:
A tuple of ManifestFile objects (tuple to prevent modification).
"""
file = io.new_input(manifest_list)
return tuple(read_manifest_list(file))
result = []

for manifest_file in read_manifest_list(file):
manifest_path = manifest_file.manifest_path
with _manifest_cache_lock:
if manifest_path in _manifest_cache:
# Reuse the cached ManifestFile object
result.append(_manifest_cache[manifest_path])
else:
# Cache and use this ManifestFile
_manifest_cache[manifest_path] = manifest_file
result.append(manifest_file)

return tuple(result)


def read_manifest_list(input_file: InputFile) -> Iterator[ManifestFile]:
Expand Down
Loading