@@ -891,48 +891,49 @@ def __hash__(self) -> int:
891891 return hash (self .manifest_path )
892892
893893
894- # Global cache for individual ManifestFile objects, keyed by manifest_path.
895- # This avoids duplicating ManifestFile objects when multiple manifest lists
896- # share the same manifests (which is common after appends) .
894+ # Global LRU cache of ManifestFile objects, keyed by manifest_path (max 512 entries).
895+ # It deduplicates ManifestFile objects across manifest lists, which commonly
896+ # share manifests after append operations.
897897_manifest_cache : LRUCache [str , ManifestFile ] = LRUCache (maxsize = 512 )
898898
899899# Reentrant lock that serializes access to _manifest_cache across threads
900900_manifest_cache_lock = threading .RLock ()
901901
902902
903903def _manifests (io : FileIO , manifest_list : str ) -> tuple [ManifestFile , ...]:
904- """Read manifests from the given manifest list, caching individual ManifestFile objects.
904+ """Read manifests from a manifest list, deduplicating ManifestFile objects via cache .
905905
906- Unlike caching entire manifest lists, this approach caches individual ManifestFile
907- objects by their manifest_path. This is more memory-efficient because:
908- - ManifestList1 contains: (ManifestFile1)
909- - ManifestList2 contains: (ManifestFile1, ManifestFile2)
910- - ManifestList3 contains: (ManifestFile1, ManifestFile2, ManifestFile3)
906+ Caches individual ManifestFile objects by manifest_path. This is memory-efficient
907+ because consecutive manifest lists typically share most of their manifests:
911908
912- With per-ManifestFile caching, ManifestFile1 is stored only once and reused,
913- instead of being duplicated in each manifest list's cached tuple.
909+ ManifestList1: [ManifestFile1]
910+ ManifestList2: [ManifestFile1, ManifestFile2]
911+ ManifestList3: [ManifestFile1, ManifestFile2, ManifestFile3]
912+
913+ With per-ManifestFile caching, each ManifestFile is stored once and reused.
914+
915+ Note: The manifest list file is re-read on each call. This is intentional to
916+ keep the implementation simple and avoid O(N²) memory growth from caching
917+ overlapping manifest list tuples. Re-reading is cheap since manifest lists
918+ are small metadata files.
914919
915920 Args:
916- io: The FileIO to read the manifest list.
917- manifest_list: The path to the manifest list file.
921+ io: FileIO instance for reading the manifest list.
922+ manifest_list: Path to the manifest list file.
918923
919924 Returns:
920- A tuple of ManifestFile objects (tuple to prevent modification) .
925+ A tuple of ManifestFile objects.
921926 """
922- # Read manifest list outside the lock to avoid blocking other threads during I/O
923927 file = io .new_input (manifest_list )
924928 manifest_files = list (read_manifest_list (file ))
925929
926- # Only hold the lock while updating the cache
927930 result = []
928931 with _manifest_cache_lock :
929932 for manifest_file in manifest_files :
930933 manifest_path = manifest_file .manifest_path
931934 if manifest_path in _manifest_cache :
932- # Reuse the cached ManifestFile object
933935 result .append (_manifest_cache [manifest_path ])
934936 else :
935- # Cache and use this ManifestFile
936937 _manifest_cache [manifest_path ] = manifest_file
937938 result .append (manifest_file )
938939
0 commit comments