@@ -761,6 +761,125 @@ def extract_helper(self, item, path, hlm, *, dry_run=False):
761761 # In this case, we *want* to extract twice, because there is no other way.
762762 pass
763763
def _fs_has_extended_acl(self, path, st):
    """
    Tell whether the existing filesystem object at *path* carries an extended ACL.

    An in-place update keeps the existing inode, and clear_attrs() does not reset
    ACLs. An extended ACL that exists only on disk (and not in the archive item)
    would therefore survive the update. Callers use this check to send such files
    through the normal extraction path, which creates a fresh inode.

    :param path: path of the existing filesystem object
    :param st: stat result for *path*, passed through to acl_get()
    :return: True if acl_get() reported at least one ``acl*`` key, else False.
             Always False on win32, when ACL handling is disabled (self.noacls)
             or when reading the ACL raises OSError.
    """
    acls_relevant = not (is_win32 or self.noacls)
    if not acls_relevant:
        return False

    # Collect the ACL information into a throw-away item.
    # NOTE(review): this relies on acl_get() adding acl_* keys only for non-trivial
    # ACLs (nothing for plain mode-only permissions) -- confirm for every platform.
    scratch = Item()
    try:
        acl_get(path, scratch, st, numeric_ids=self.numeric_ids)
    except OSError:
        # ACLs cannot be read at all (e.g. the filesystem does not support them),
        # so this is treated as "no extended ACL present".
        return False

    for name in scratch.as_dict():
        if name.startswith("acl"):
            return True
    return False
784+
def can_patch_in_place(self, item, path, st):
    """
    Decide whether the existing object at *path* may be updated in place from *item*.

    "In place" means keeping the existing file and fetching only the chunks that
    differ (see compare_and_extract_chunks). All of the following must hold:

    - the destination exists (*st* is not None),
    - the item is not part of a hard link group (no "hlid" key),
    - both the destination and the item are regular files,
    - the destination has exactly one link,
    - the destination has no extended ACL (see _fs_has_extended_acl()).

    :param item: archive item to be extracted
    :param path: filesystem path of the destination
    :param st: stat result of the destination, or None if it does not exist
    :return: True if an in-place update is allowed, else False
    """
    if st is None or "hlid" in item:
        return False
    # Check the on-disk type first: the item's mode is only consulted when the
    # destination itself is a regular file.
    both_regular = stat.S_ISREG(st.st_mode) and stat.S_ISREG(item.mode)
    if not both_regular or st.st_nlink != 1:
        return False
    has_acl = self._fs_has_extended_acl(path, st)
    return not has_acl
805+
def will_patch_in_place(self, item):
    """
    Predict whether extracting *item* will update its destination in place.

    Works like can_patch_in_place(), except that the destination path is derived
    from self.cwd and item.path and stat'ed here (without following symlinks).
    Intended for the extract command, to decide whether preloading this item's
    chunks can be skipped.

    :param item: archive item that is about to be extracted
    :return: True if the destination exists and qualifies for an in-place update,
             else False (also when the destination cannot be stat'ed)
    """
    # Cheap item-only checks first, so we avoid touching the filesystem when possible.
    if "hlid" in item:
        return False
    if not stat.S_ISREG(item.mode):
        return False

    target = os.path.join(self.cwd, item.path)
    try:
        target_st = os.stat(target, follow_symlinks=False)
    except OSError:
        return False
    return self.can_patch_in_place(item, target, target_st)
820+
def compare_and_extract_chunks(self, item, path, *, st, pi=None):
    """
    Update the existing regular file at *path* in place from *item*, fetching only the
    chunks whose content differs from what is already on disk.

    :param item: archive item (regular file) to extract
    :param path: filesystem path of the existing destination file
    :param st: stat result of the existing file as determined by the caller
               (None if there is no such file)
    :param pi: optional progress indicator; advanced by the archived chunk size for
               every chunk processed in the second pass
    :return: True if the file was updated in place; False if the caller should fall
             back to a full extraction. False is returned when can_patch_in_place()
             rejects the file, or when the file cannot be opened for updating due to
             missing permission. In the latter case the file is unlinked first, so the
             fallback creates a fresh file.
    :raises BackupError: if item.size disagrees with the size of the written chunks
    """
    if not self.can_patch_in_place(item, path, st):
        return False

    # Opening for update needs write permission on the file itself, while replacing a
    # file only needs write permission on its directory. A read-only existing file must
    # therefore not make the extraction fail: remove it and let the caller extract it
    # the normal way.
    with backup_io("open"):
        try:
            fs_file = open(path, "rb+")
        except PermissionError:
            fs_file = None
    if fs_file is None:
        with backup_io("unlink"):
            os.unlink(path)
        return False

    with fs_file:
        # First pass (read-only): hash the existing on-disk content using the archived
        # chunk sizes, so it can be compared chunk-by-chunk with the archived chunk list.
        # A short or empty read (file shorter than the archive item) simply yields an
        # entry that does not match.
        fs_chunks = []
        for item_chunk in item.chunks:
            with backup_io("read"):
                data = fs_file.read(item_chunk.size)
            fs_chunks.append(ChunkListEntry(id=self.key.id_hash(data), size=len(data)))

        # Only the chunks that actually differ need to be fetched from the repository.
        # They are requested as regular, non-preloaded objects (no is_preloaded flag).
        needed_chunks = [
            item_chunk for fs_chunk, item_chunk in zip(fs_chunks, item.chunks) if fs_chunk != item_chunk
        ]
        fetched_chunks = self.pipeline.fetch_many(needed_chunks, ro_type=ROBJ_FILE_STREAM)

        # The progress info is the same for every chunk, so build it once.
        info = [remove_surrogates(item.path)] if pi else None

        # Second pass: for each archived chunk, seek over the matching on-disk chunk or
        # overwrite the differing one with the freshly fetched data.
        with backup_io("seek"):
            fs_file.seek(0)
        for fs_chunk, item_chunk in zip(fs_chunks, item.chunks):
            if fs_chunk == item_chunk:
                with backup_io("seek"):
                    fs_file.seek(item_chunk.size, os.SEEK_CUR)
            else:
                # fetched_chunks yields data in the same order as needed_chunks.
                data = next(fetched_chunks)
                with backup_io("write"):
                    fs_file.write(data)
            if pi:
                pi.show(increase=item_chunk.size, info=info)

        with backup_io("truncate_and_attrs"):
            # Cut off anything the old file had beyond the end of the archived content.
            item_chunks_size = fs_file.tell()
            fs_file.truncate(item_chunks_size)
            fs_file.flush()
            fd = fs_file.fileno()
            # The file existed before, so it may carry stale metadata that
            # restore_attrs() would not clear: wipe it first.
            self.clear_attrs(path, fd=fd)
            self.restore_attrs(path, item, fd=fd)

    if "size" in item:
        item_size = item.size
        if item_size != item_chunks_size:
            raise BackupError(f"Size inconsistency detected: size {item_size}, chunks size {item_chunks_size}")
    return True
882+
764883 def extract_item (
765884 self ,
766885 item ,
@@ -772,6 +891,7 @@ def extract_item(
772891 hlm = None ,
773892 pi = None ,
774893 continue_extraction = False ,
894+ preloaded = True ,
775895 ):
776896 """
777897 Extract archive item.
@@ -784,6 +904,9 @@ def extract_item(
784904 :param hlm: maps hlid to link_target for extracting subtrees with hard links correctly
785905 :param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes)
786906 :param continue_extraction: continue a previously interrupted extraction of the same archive
907+ :param preloaded: whether this item's chunks were preloaded (see preload_item_chunks).
908+ Must be False if the caller skipped preloading (e.g. for in-place updates), so the
909+ full-extraction fetch does not wait for preloaded chunks that were never requested.
787910 """
788911
789912 def same_item (item , st ):
@@ -817,7 +940,9 @@ def same_item(item, st):
817940 # it would get stuck.
818941 if "chunks" in item :
819942 item_chunks_size = 0
820- for data in self .pipeline .fetch_many (item .chunks , is_preloaded = True , ro_type = ROBJ_FILE_STREAM ):
943+ for data in self .pipeline .fetch_many (
944+ item .chunks , is_preloaded = preloaded , ro_type = ROBJ_FILE_STREAM
945+ ):
821946 if pi :
822947 pi .show (increase = len (data ), info = [remove_surrogates (item .path )])
823948 if stdout :
@@ -837,13 +962,20 @@ def same_item(item, st):
837962
838963 dest = self .cwd
839964 path = os .path .join (dest , item .path )
965+ st = None # There is no file at path (or we could not stat it).
840966 # Attempt to remove existing files, ignore errors on failure
841967 try :
842968 st = os .stat (path , follow_symlinks = False )
843969 if continue_extraction and same_item (item , st ):
844970 return # done! we already have fully extracted this file in a previous run.
845- if not stat .S_ISDIR (st .st_mode ):
971+ if self .can_patch_in_place (item , path , st ):
972+ # keep the existing regular file in place so it can be updated by only
973+ # fetching the chunks that differ from what is already there.
974+ # compare_and_extract_chunks() will use this st.
975+ pass
976+ elif not stat .S_ISDIR (st .st_mode ):
846977 os .unlink (path )
978+ st = None
847979 elif stat .S_ISDIR (item .mode ):
848980 # if we have an existing directory and we want to extract a directory,
849981 # we just use the existing one and do not remove it.
@@ -852,10 +984,11 @@ def same_item(item, st):
852984 pass
853985 else :
854986 os .rmdir (path ) # only works for empty directories
987+ st = None
855988 except UnicodeEncodeError :
856989 raise self .IncompatibleFilesystemEncodingError (path , sys .getfilesystemencoding ()) from None
857990 except OSError :
858- pass
991+ st = None
859992
860993 def make_parent (path ):
861994 parent_dir = os .path .dirname (path )
@@ -869,11 +1002,13 @@ def make_parent(path):
8691002 with self .extract_helper (item , path , hlm ) as hardlink_set :
8701003 if hardlink_set :
8711004 return
1005+ if self .compare_and_extract_chunks (item , path , st = st , pi = pi ):
1006+ return
8721007 with backup_io ("open" ):
8731008 fd = open (path , "wb" )
8741009 with fd :
8751010 trailing_hole = False
876- for data in self .pipeline .fetch_many (item .chunks , is_preloaded = True , ro_type = ROBJ_FILE_STREAM ):
1011+ for data in self .pipeline .fetch_many (item .chunks , is_preloaded = preloaded , ro_type = ROBJ_FILE_STREAM ):
8771012 if pi :
8781013 pi .show (increase = len (data ), info = [remove_surrogates (item .path )])
8791014 with backup_io ("write" ):
@@ -1031,6 +1166,31 @@ def restore_attrs(self, path, item, symlink=False, fd=None):
10311166 # some systems don't support calling utime on a symlink
10321167 pass
10331168
def clear_attrs(self, path, fd=None):
    """
    Strip pre-existing metadata (BSD-style flags and extended attributes) from a file.

    This prepares an existing file for a following restore_attrs() call when it is
    updated in place (see compare_and_extract_chunks): metadata that is on the file
    but not in the archive item would otherwise survive the update.

    ACLs are not touched here, so an ACL that exists only on the pre-existing file
    is not removed by this method.

    :param path: path of the file to clean
    :param fd: optional open file descriptor of the same file; when given, it is
               passed to set_flags() and used instead of *path* for clearing xattrs
    """
    if is_win32:
        return

    # Flags go first, so a leftover flag on the existing file (e.g. immutable or
    # append-only) is reset before the xattrs are cleared.
    if not self.noflags:
        try:
            set_flags(path, 0, fd=fd)
        except OSError:
            # best effort: failing to reset the flags does not abort the cleanup
            pass

    if self.noxattrs:
        return
    target = path if fd is None else fd
    xattr.clear_all(target, follow_symlinks=False)
1193+
10341194 def set_meta (self , key , value ):
10351195 metadata = self ._load_meta (self .id )
10361196 setattr (metadata , key , value )
0 commit comments