Skip to content

Commit 41f3562

Browse files
committed
Extract: Implement selective chunk extraction for regular files (#5638)
Signed-off-by: alighazi288 <51366992+alighazi288@users.noreply.github.com>
1 parent 93986b9 commit 41f3562

2 files changed

Lines changed: 173 additions & 5 deletions

File tree

src/borg/archive.py

Lines changed: 164 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,125 @@ def extract_helper(self, item, path, hlm, *, dry_run=False):
761761
# In this case, we *want* to extract twice, because there is no other way.
762762
pass
763763

764+
def _fs_has_extended_acl(self, path, st):
    """
    Tell whether the filesystem object at *path* carries a non-trivial (extended) ACL.

    An in-place update does not reset ACLs (clear_attrs() leaves them alone on purpose), so
    an extended ACL that exists on disk but not in the archive item would be left behind.
    Callers use this check to send such files through the normal extraction path, which
    creates a fresh inode.

    :param path: path of the existing filesystem object
    :param st: stat result for *path*, passed through to acl_get()
    :return: True if acl_get() reported any ACL data, else False. Always False on win32,
        when ACL handling is disabled (self.noacls), or when reading the ACL raised OSError.
    """
    if is_win32 or self.noacls:
        return False
    found = Item()
    try:
        # acl_get only stores acl_* keys for a non-trivial ACL; for plain mode-only
        # permissions it stores nothing (it checks acl_extended_*() first on all platforms).
        acl_get(path, found, st, numeric_ids=self.numeric_ids)
    except OSError:
        # ACLs cannot even be read (e.g. unsupported by the fs), so none can survive: safe.
        return False
    for name in found.as_dict():
        if name.startswith("acl"):
            return True
    return False
784+
785+
def can_patch_in_place(self, item, path, st):
    """
    Can the existing filesystem object at *path* (described by *st*) be updated in place
    from *item* by only fetching the chunks that differ (see compare_and_extract_chunks)?

    We only do this for plain regular files (not hard links): the destination must already
    exist as a regular file and the item must be a regular file, too. Hard links keep going
    through the normal extraction path so the preloading bookkeeping stays correct.

    The existing file must also be readable and writable by us: the in-place update opens
    it read/write, while the normal extraction path unlinks and recreates the file and thus
    also works for e.g. read-only files.

    Finally, we skip files that carry an extended ACL, see _fs_has_extended_acl().

    :param item: archive item that shall be extracted to *path*
    :param path: destination path in the filesystem
    :param st: stat result of *path* (not following symlinks), or None if there is no file
    :return: True if the file at *path* may be patched in place, False otherwise
    """
    if st is None:
        return False
    if "hlid" in item:
        return False
    if not (stat.S_ISREG(st.st_mode) and stat.S_ISREG(item.mode)):
        return False
    if st.st_nlink != 1:
        return False
    # compare_and_extract_chunks() opens the file with mode "rb+". If we are not allowed to
    # do that, say "no" here, so the caller removes the file and extracts it the normal way
    # (and the extract command preloads its chunks as usual).
    if not os.access(path, os.R_OK | os.W_OK):
        return False
    return not self._fs_has_extended_acl(path, st)
805+
806+
def will_patch_in_place(self, item):
    """
    Like can_patch_in_place(), but stats the destination itself.

    Used by the extract command to decide whether to skip preloading this item's chunks.

    This is only a prediction and must never fail: if the destination cannot be stat'ed
    for whatever reason, we answer False, so the chunks get preloaded and extract_item()
    deals with (and reports) the problem.

    :param item: archive item that is about to be extracted below self.cwd
    :return: True if the item is expected to be updated in place, False otherwise
    """
    if "hlid" in item or not stat.S_ISREG(item.mode):
        return False
    path = os.path.join(self.cwd, item.path)
    try:
        st = os.stat(path, follow_symlinks=False)
    except (OSError, ValueError):
        # OSError: there is no such file (or we may not stat it).
        # ValueError: the path can't be handed to the OS at all, e.g. UnicodeEncodeError
        # for an incompatible filesystem encoding or an embedded NUL byte.
        return False
    return self.can_patch_in_place(item, path, st)
820+
821+
def compare_and_extract_chunks(self, item, path, *, st, pi=None):
    """
    Patch the existing regular file at *path* so that its content matches *item*, fetching
    only those chunks from the repository whose content is not already on disk.

    :param item: archive item to extract
    :param path: path of the existing destination file
    :param st: stat result of the existing file, as determined by the caller
    :param pi: optional progress indicator, advanced by the archived size of each chunk
    :return: True if the file was updated in place, False if the caller should fall back
        to a full extraction
    :raises BackupError: if the item has a size and it differs from the resulting file size
    """
    if not self.can_patch_in_place(item, path, st):
        return False

    with backup_io("open"):
        target = open(path, "rb+")
    with target:
        # Pass 1 (read-only): cut the on-disk content into pieces of the archived chunk
        # sizes and hash each piece, giving a list comparable to the archived chunk list.
        on_disk = []
        for wanted in item.chunks:
            with backup_io("read"):
                blob = target.read(wanted.size)
            on_disk.append(ChunkListEntry(id=self.key.id_hash(blob), size=len(blob)))

        pairs = list(zip(on_disk, item.chunks))

        # Only differing chunks have to come from the repository. They were not preloaded
        # (see will_patch_in_place / the extract command), so they are requested as
        # regular, non-preloaded objects.
        missing = [wanted for have, wanted in pairs if have != wanted]
        replacements = self.pipeline.fetch_many(missing, ro_type=ROBJ_FILE_STREAM)

        # Pass 2: walk the archived chunk list again from the start of the file; overwrite
        # what differs with the fetched data, skip over what is already correct.
        with backup_io("seek"):
            target.seek(0)
        for have, wanted in pairs:
            if have != wanted:
                blob = next(replacements)
                with backup_io("write"):
                    target.write(blob)
            else:
                with backup_io("seek"):
                    target.seek(wanted.size, os.SEEK_CUR)
            if pi:
                pi.show(increase=wanted.size, info=[remove_surrogates(item.path)])

        with backup_io("truncate_and_attrs"):
            # cut off anything the old file had beyond the end of the archived content
            item_chunks_size = target.tell()
            target.truncate(item_chunks_size)
            target.flush()
            fileno = target.fileno()
            # the file existed before, so it may carry stale metadata that
            # restore_attrs() would not clear: wipe it first.
            self.clear_attrs(path, fd=fileno)
            self.restore_attrs(path, item, fd=fileno)

    if "size" in item:
        expected_size = item.size
        if expected_size != item_chunks_size:
            raise BackupError(f"Size inconsistency detected: size {expected_size}, chunks size {item_chunks_size}")
    return True
882+
764883
def extract_item(
765884
self,
766885
item,
@@ -772,6 +891,7 @@ def extract_item(
772891
hlm=None,
773892
pi=None,
774893
continue_extraction=False,
894+
preloaded=True,
775895
):
776896
"""
777897
Extract archive item.
@@ -784,6 +904,9 @@ def extract_item(
784904
:param hlm: maps hlid to link_target for extracting subtrees with hard links correctly
785905
:param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes)
786906
:param continue_extraction: continue a previously interrupted extraction of the same archive
907+
:param preloaded: whether this item's chunks were preloaded (see preload_item_chunks).
908+
Must be False if the caller skipped preloading (e.g. for in-place updates), so the
909+
full-extraction fetch does not wait for preloaded chunks that were never requested.
787910
"""
788911

789912
def same_item(item, st):
@@ -817,7 +940,9 @@ def same_item(item, st):
817940
# it would get stuck.
818941
if "chunks" in item:
819942
item_chunks_size = 0
820-
for data in self.pipeline.fetch_many(item.chunks, is_preloaded=True, ro_type=ROBJ_FILE_STREAM):
943+
for data in self.pipeline.fetch_many(
944+
item.chunks, is_preloaded=preloaded, ro_type=ROBJ_FILE_STREAM
945+
):
821946
if pi:
822947
pi.show(increase=len(data), info=[remove_surrogates(item.path)])
823948
if stdout:
@@ -837,13 +962,20 @@ def same_item(item, st):
837962

838963
dest = self.cwd
839964
path = os.path.join(dest, item.path)
965+
st = None # There is no file at path (or we could not stat it).
840966
# Attempt to remove existing files, ignore errors on failure
841967
try:
842968
st = os.stat(path, follow_symlinks=False)
843969
if continue_extraction and same_item(item, st):
844970
return # done! we already have fully extracted this file in a previous run.
845-
if not stat.S_ISDIR(st.st_mode):
971+
if self.can_patch_in_place(item, path, st):
972+
# keep the existing regular file in place so it can be updated by only
973+
# fetching the chunks that differ from what is already there.
974+
# compare_and_extract_chunks() will use this st.
975+
pass
976+
elif not stat.S_ISDIR(st.st_mode):
846977
os.unlink(path)
978+
st = None
847979
elif stat.S_ISDIR(item.mode):
848980
# if we have an existing directory and we want to extract a directory,
849981
# we just use the existing one and do not remove it.
@@ -852,10 +984,11 @@ def same_item(item, st):
852984
pass
853985
else:
854986
os.rmdir(path) # only works for empty directories
987+
st = None
855988
except UnicodeEncodeError:
856989
raise self.IncompatibleFilesystemEncodingError(path, sys.getfilesystemencoding()) from None
857990
except OSError:
858-
pass
991+
st = None
859992

860993
def make_parent(path):
861994
parent_dir = os.path.dirname(path)
@@ -869,11 +1002,13 @@ def make_parent(path):
8691002
with self.extract_helper(item, path, hlm) as hardlink_set:
8701003
if hardlink_set:
8711004
return
1005+
if self.compare_and_extract_chunks(item, path, st=st, pi=pi):
1006+
return
8721007
with backup_io("open"):
8731008
fd = open(path, "wb")
8741009
with fd:
8751010
trailing_hole = False
876-
for data in self.pipeline.fetch_many(item.chunks, is_preloaded=True, ro_type=ROBJ_FILE_STREAM):
1011+
for data in self.pipeline.fetch_many(item.chunks, is_preloaded=preloaded, ro_type=ROBJ_FILE_STREAM):
8771012
if pi:
8781013
pi.show(increase=len(data), info=[remove_surrogates(item.path)])
8791014
with backup_io("write"):
@@ -1031,6 +1166,31 @@ def restore_attrs(self, path, item, symlink=False, fd=None):
10311166
# some systems don't support calling utime on a symlink
10321167
pass
10331168

1169+
def clear_attrs(self, path, fd=None):
    """
    Strip pre-existing filesystem metadata (BSD-style flags and extended attributes) from
    *path* (*fd*), so a following restore_attrs() starts from a "fresh" state.

    restore_attrs() only *adds* the metadata stored in the archive item, as if the file had
    just been created. A file that is updated in place (see compare_and_extract_chunks)
    may carry xattrs or flags that the archive does not have; without this step they would
    survive the extraction.

    Note: ACLs are not cleared here. An access ACL present in the archive item is overwritten
    by restore_attrs(); however, ACLs present only on the pre-existing file are not removed.

    :param path: path of the existing file
    :param fd: optional open file descriptor for the same file; when given, it is used
        instead of *path* for clearing the xattrs and is passed on to set_flags()
    """
    if is_win32:
        return
    # Flags go first: a leftover immutable/append flag from the existing file must not
    # survive, and restore_attrs() only sets flags when the archive item carries them.
    if not self.noflags:
        try:
            set_flags(path, 0, fd=fd)
        except OSError:
            pass
    if self.noxattrs:
        return
    target = path if fd is None else fd
    xattr.clear_all(target, follow_symlinks=False)
1193+
10341194
def set_meta(self, key, value):
10351195
metadata = self._load_meta(self.id)
10361196
setattr(metadata, key, value)

src/borg/archiver/extract_cmd.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,14 @@ def do_extract(self, args, repository, manifest, archive):
7272
logging.getLogger("borg.output.list").info(f"{log_prefix} {remove_surrogates(item.path)}")
7373

7474
if is_matched:
75-
archive.preload_item_chunks(item, optimize_hardlinks=True)
75+
# Skip preloading when we will update an existing regular file in place:
76+
# that path only fetches the chunks that differ, so preloading all of them
77+
# would leak the unfetched ones in the RemoteRepository (see preload_item_chunks).
78+
preloaded = True
79+
if not dry_run and not stdout and archive.will_patch_in_place(item):
80+
preloaded = False
81+
else:
82+
archive.preload_item_chunks(item, optimize_hardlinks=True)
7683

7784
if not dry_run:
7885
while dirs and not item.path.startswith(dirs[-1].path):
@@ -97,6 +104,7 @@ def do_extract(self, args, repository, manifest, archive):
97104
hlm=hlm,
98105
pi=pi,
99106
continue_extraction=continue_extraction,
107+
preloaded=preloaded,
100108
)
101109
except BackupError as e:
102110
self.print_warning_instance(BackupWarning(remove_surrogates(orig_path), e))

0 commit comments

Comments
 (0)