Skip to content

Commit 840de26

Browse files
Merge pull request #9798 from mr-raj12/pack-files-remove-objects-helper
repository: add replace_pack helper
2 parents 17c3d67 + bf36090 commit 840de26

2 files changed

Lines changed: 130 additions & 2 deletions

File tree

src/borg/repository.py

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -695,8 +695,7 @@ def store_list(namespace):
695695
logger.error("Repository index is corrupted and must be repaired; skipping the pack check.")
696696
objs_errors = index_errors + pack_errors
697697
logger.info(
698-
f"Checked {index_files} index files ({index_errors} errors) "
699-
f"and {pack_files} packs ({pack_errors} errors)."
698+
f"Checked {index_files} index files ({index_errors} errors) and {pack_files} packs ({pack_errors} errors)."
700699
)
701700
if objs_errors == 0:
702701
logger.info(f"Finished {mode} repository check, no problems found.")
@@ -811,6 +810,70 @@ def delete(self, id):
811810
raise self.ObjectNotFound(id, str(self._location))
812811
logger.warning("ignoring deletion of %s in %s", bin_to_hex(id), bin_to_hex(entry.pack_id))
813812

813+
def compact_pack(self, pack_id, *, keep_ids: set, drop_ids: set):
    """Rewrite pack <pack_id>, keeping <keep_ids> and dropping <drop_ids>, then delete the old pack.

    keep_ids and drop_ids are sets of chunk ids that must be disjoint and must together cover the
    whole pack: ordered by offset, their ranges have to tile it with no gap or overlap. Kept objects
    are copied into a new pack via store.defrag and repointed in the chunk index; dropped objects'
    index entries are removed.

    Returns the new pack_id, None if nothing is kept (pack dropped), or <pack_id> unchanged if the
    kept objects reproduce the old pack (same sha256 name, nothing to delete).

    Raises AssertionError if an id appears in both sets, if an id is indexed in a different pack, or
    if the ranges do not tile the pack exactly. The checks are raised explicitly instead of being
    written as assert statements, so they keep guarding the pack deletion when Python runs with -O.
    All of them (and the index lookups they rely on) run before the index or the store is modified.

    Updates the in-memory chunk index only. The caller holds the exclusive lock and owns index
    durability: invalidate the cached index before calling, write it back after, as compact does.
    """
    self._lock_refresh()
    pack_hex = bin_to_hex(pack_id)
    pack_key = "packs/" + pack_hex

    if keep_ids & drop_ids:
        raise AssertionError("an id cannot appear in both keep_ids and drop_ids")

    # collect every object's range, tagged with whether it is kept, ordered by offset.
    located = []  # (obj_offset, obj_id, obj_size, keep)
    for obj_id in keep_ids | drop_ids:
        entry = self.chunks[obj_id]
        if entry.pack_id != pack_id:
            raise AssertionError(f"{bin_to_hex(obj_id)} is not in pack {pack_hex}")
        located.append((entry.obj_offset, obj_id, entry.obj_size, obj_id in keep_ids))
    located.sort()

    # keep + drop must tile the whole pack; collect the objects to keep in the same pass.
    kept = []  # (obj_offset, obj_id, obj_size), offset-ordered
    covered = 0
    for offset, obj_id, size, keep in located:
        if offset != covered:
            raise AssertionError(f"gap or overlap in pack {pack_hex} at offset {covered}")
        covered += size
        if keep:
            kept.append((offset, obj_id, size))
    if covered != self.store.info(pack_key).size:
        raise AssertionError(f"pack {pack_hex} not fully covered")

    # remove dropped objects from the index first; their bytes are not copied forward. doing this
    # before touching the store means a failing store call can only leave unreferenced pack data
    # behind, never an index entry that points at data which is gone.
    for drop_id in drop_ids:
        del self.chunks[drop_id]

    if not kept:  # nothing kept: drop the pack, no replacement
        self.store_delete(pack_key)
        return None

    # copy kept objects into a new pack (named sha256 of its content)
    sources = [(pack_hex, offset, size) for offset, _, size in kept]
    new_pack_id = hex_to_bin(self.store.defrag(sources, algorithm="sha256", namespace="packs"))

    # repoint kept objects at the new pack; new offset is the running sum of kept sizes
    new_locations = []
    new_offset = 0
    for _, keep_id, size in kept:
        new_locations.append((keep_id, new_pack_id, new_offset, size))
        new_offset += size
    self.chunks.update_pack_info(new_locations)

    # delete the old pack last, after the new one is stored and indexed, so kept bytes are never the
    # only copy. if every object was kept in order, defrag reproduced the pack (new_pack_id == pack_id)
    # and deleting it would drop what we kept, so skip.
    if new_pack_id != pack_id:
        self.store_delete(pack_key)
    return new_pack_id
876+
814877
def break_lock(self):
815878
Lock(self.store).break_lock()
816879

src/borg/testsuite/repository_test.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,71 @@ def test_consistency(repo_fixtures, request):
142142
assert pdchunk(repository.get(H(0))) == b"bar"
143143

144144

145+
def build_one_pack(repository, objects):
    """Store all (chunk_id, chunk) pairs from <objects> in the repository, then flush once.

    The pack writer's max_count is raised above the number of objects first; the intent is that no
    single put triggers a flush, so that the final flush() writes everything as one pack.
    """
    with repository:
        writer = repository._pack_writer
        writer.max_count = 1 + len(objects)  # prevent per-put flush; one pack on flush()
        for object_id, payload in objects:
            repository.put(object_id, payload)
        repository.flush()
151+
152+
153+
def test_compact_pack_copy_forward(repo_fixtures, request):
    """Keeping a subset of a multi-object pack copies the survivors forward and discards the rest."""
    kept_first = fchunk(b"DATA0", chunk_id=H(0))
    dropped = fchunk(b"DATA1", chunk_id=H(1))
    kept_last = fchunk(b"DATA2", chunk_id=H(2))
    repository = get_repository_from_fixture(repo_fixtures, request)
    build_one_pack(repository, [(H(0), kept_first), (H(1), dropped), (H(2), kept_last)])
    with repository:
        # all three objects must start out in the same pack
        old_pack_id = repository.chunks[H(0)].pack_id
        for n in (1, 2):
            assert repository.chunks[H(n)].pack_id == old_pack_id

        new_pack_id = repository.compact_pack(old_pack_id, keep_ids={H(0), H(2)}, drop_ids={H(1)})

        # a real replacement pack was written
        assert new_pack_id is not None
        assert new_pack_id != old_pack_id
        # survivors still read back, the dropped object has no index entry any more
        for kept_id, payload in ((H(0), b"DATA0"), (H(2), b"DATA2")):
            assert pdchunk(repository.get(kept_id)) == payload
        assert repository.get(H(1), raise_missing=False) is None
        # the old pack is gone and the new one holds only the kept objects' bytes
        pack_sizes = {info.name: info.size for info in repository.store_list("packs")}
        assert bin_to_hex(old_pack_id) not in pack_sizes
        assert pack_sizes[bin_to_hex(new_pack_id)] == len(kept_first) + len(kept_last)
174+
175+
176+
def test_compact_pack_drops_whole_pack(repo_fixtures, request):
    """Dropping every object deletes the pack and removes all of its index entries."""
    objects = [(H(0), fchunk(b"DATA0", chunk_id=H(0))), (H(1), fchunk(b"DATA1", chunk_id=H(1)))]
    repository = get_repository_from_fixture(repo_fixtures, request)
    build_one_pack(repository, objects)
    with repository:
        old_pack_id = repository.chunks[H(0)].pack_id

        # nothing kept: no replacement pack, so the helper reports None
        result = repository.compact_pack(old_pack_id, keep_ids=set(), drop_ids={H(0), H(1)})
        assert result is None

        for gone_id in (H(0), H(1)):
            assert repository.get(gone_id, raise_missing=False) is None
        remaining_packs = [info.name for info in repository.store_list("packs")]
        assert bin_to_hex(old_pack_id) not in remaining_packs
190+
191+
192+
def test_compact_pack_keep_all_is_noop(repo_fixtures, request):
    """Keeping every object reproduces the same pack: same sha256 name, old pack not deleted.

    keep_ids is a set and so carries no order of its own; the result relies on compact_pack
    ordering the objects by their offset in the pack.
    """
    objects = [(H(0), fchunk(b"DATA0", chunk_id=H(0))), (H(1), fchunk(b"DATA1", chunk_id=H(1)))]
    repository = get_repository_from_fixture(repo_fixtures, request)
    build_one_pack(repository, objects)
    with repository:
        old_pack_id = repository.chunks[H(0)].pack_id

        new_pack_id = repository.compact_pack(old_pack_id, keep_ids={H(1), H(0)}, drop_ids=set())

        assert new_pack_id == old_pack_id
        for kept_id, payload in ((H(0), b"DATA0"), (H(1), b"DATA1")):
            assert pdchunk(repository.get(kept_id)) == payload
        surviving_packs = [info.name for info in repository.store_list("packs")]
        assert bin_to_hex(old_pack_id) in surviving_packs
208+
209+
145210
def test_list(repo_fixtures, request):
146211
with get_repository_from_fixture(repo_fixtures, request) as repository:
147212
for x in range(100):

0 commit comments

Comments
 (0)