-
-
Notifications
You must be signed in to change notification settings - Fork 859
pack-files: never drop a whole pack to delete a single object #9813
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
9fff040
82c589f
5747a19
e74d7ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -736,10 +736,10 @@ def get(self, id, read_data=True, raise_missing=True): | |
| raise self.ObjectNotFound(id, str(self._location)) | ||
| return None | ||
| if entry.pack_id == UNKNOWN_BYTES32: | ||
| # chunk is buffered in PackWriter, not yet flushed to a pack. at N=1 put() flushes | ||
| # immediately, so reaching here points at a flush / index-update ordering bug, not a | ||
| # genuinely missing object. this is a code bug, so we crash loudly regardless of | ||
| # raise_missing instead of pretending the object is absent. | ||
| # chunk is buffered in PackWriter, not yet flushed to a pack. Everything must be flushed | ||
| # before it can be read back, so reaching here points at a flush / index-update ordering | ||
| # bug, not a genuinely missing object. this is a code bug, so we crash loudly regardless | ||
| # of raise_missing instead of pretending the object is absent. | ||
| raise self.PackLocationUnknown(id, str(self._location)) | ||
| pack_id, obj_offset, obj_size = entry.pack_id, entry.obj_offset, entry.obj_size | ||
| id_hex = bin_to_hex(id) | ||
|
|
@@ -753,10 +753,10 @@ def get(self, id, read_data=True, raise_missing=True): | |
| hdr_size = RepoObj.obj_header.size | ||
| extra_size = 1024 - hdr_size # load a bit more, 1024b, reduces round trips | ||
| load_size = hdr_size + extra_size | ||
| # keep the read inside this object: at N>1 a pack holds neighbouring objects, so | ||
| # don't pull bytes past obj_size into the next one. (an overshoot would be harmless | ||
| # -- parse_meta uses the header's length and ignores trailing bytes -- this is just | ||
| # tidy.) obj_size comes from the same index we already route with. | ||
| # keep the read inside this object: a pack holds neighbouring objects, so don't pull | ||
| # bytes past obj_size into the next one. (an overshoot would be harmless -- parse_meta | ||
| # uses the header's length and ignores trailing bytes -- this is just tidy.) obj_size | ||
| # comes from the same index we already route with. | ||
| load_size = min(load_size, obj_size) | ||
| obj = self.store.load(key, offset=obj_offset, size=load_size) | ||
| hdr = obj[0:hdr_size] | ||
|
|
@@ -787,9 +787,9 @@ def get_many(self, ids, read_data=True, raise_missing=True): | |
| def put(self, id, data): | ||
| """put a repo object | ||
|
|
||
| Returns a list of (chunk_id, pack_id, obj_offset, obj_size) tuples for | ||
| every chunk written to disk this call. At max_count=1 this is always | ||
| one entry. | ||
| Buffers the chunk in the pack writer. When the chunk fills the pack and | ||
| triggers a flush, returns a list of (chunk_id, pack_id, obj_offset, obj_size) | ||
| tuples, one per chunk written to disk by that flush; otherwise returns None. | ||
| """ | ||
| self._lock_refresh() | ||
| data_size = len(data) | ||
|
|
@@ -799,22 +799,39 @@ def put(self, id, data): | |
| return self._pack_writer.add(id, data) | ||
|
|
||
| def delete(self, id): | ||
| """delete a repo object""" | ||
| """Delete a single repo object by rewriting its pack without it. | ||
|
|
||
| A pack holds several objects, so we can not just drop the whole pack: that would take the | ||
| target's innocent neighbours with it. Route through compact_pack, which copies the survivors | ||
| into a new pack and repoints them in the chunk index. Rewriting a whole pack to remove one | ||
| object is slow, but delete is only used by special-purpose callers (tests, the debug command, | ||
| check --repair), never on a hot path, so the cost does not matter. | ||
| """ | ||
| self._lock_refresh() | ||
| # We can not remove one object by dropping its whole pack without losing the pack's other | ||
| # objects; real removal is store_delete at the pack level (compact). For now just check the | ||
| # object exists (ObjectNotFound contract), log, and do nothing. | ||
| # TODO: delete a single object once a pack can hold more than one (N>1). | ||
| from .cache import write_chunkindex_to_repo | ||
|
|
||
| entry = self.chunks.get(id) | ||
| if entry is None: | ||
| raise self.ObjectNotFound(id, str(self._location)) | ||
| logger.warning("ignoring deletion of %s in %s", bin_to_hex(id), bin_to_hex(entry.pack_id)) | ||
| pack_id = entry.pack_id | ||
| # keep every other object in the same pack; compact_pack rewrites the pack without the target. | ||
| # work off the live index (each entry already carries its offset/size after flush): do not | ||
| # rebuild from the packs here, that could not tell the target apart from stale duplicate copies | ||
| # of the same id left in other packs by re-puts (build keeps an arbitrary one of them). | ||
| keep_ids = {cid for cid, e in self.chunks.iteritems() if e.pack_id == pack_id} | ||
| keep_ids.discard(id) | ||
| self.compact_pack(pack_id, keep_ids=keep_ids, drop_ids={id}) | ||
| # persist a full index so a following borg process sees the deletion; close() only writes new | ||
| # entries incrementally and would not record the removal. | ||
| write_chunkindex_to_repo(self, self.chunks, incremental=False, force_write=True, delete_other=True) | ||
|
|
||
| def compact_pack(self, pack_id, *, keep_ids: set, drop_ids: set): | ||
| """Rewrite pack <pack_id>, keeping <keep_ids> and dropping <drop_ids>, then delete the old pack. | ||
|
|
||
| keep_ids and drop_ids are sets of chunk ids that must together cover the whole pack (asserted: | ||
| their ranges tile it with no gap or overlap, and their intersection is empty). Kept objects are | ||
| keep_ids and drop_ids are sets of chunk ids that must together be every object of the pack, as | ||
| recorded in the chunk index. The caller guarantees that completeness (both callers build the two | ||
| sets by iterating the index for this pack_id); here we only assert their ranges tile contiguously | ||
| from offset 0 with no gap or overlap, and that their intersection is empty. Kept objects are | ||
| copied into a new pack via store.defrag and repointed in the chunk index; dropped objects' index | ||
| entries are removed. | ||
|
|
||
|
|
@@ -838,15 +855,17 @@ def compact_pack(self, pack_id, *, keep_ids: set, drop_ids: set): | |
| located.append((entry.obj_offset, obj_id, entry.obj_size, keep)) | ||
| located.sort() | ||
|
|
||
| # keep + drop must tile the whole pack; collect the objects to keep in the same pass. | ||
| # keep + drop tile the pack contiguously from offset 0; collect the objects to keep in the same | ||
| # pass. we do not cross-check against the pack's on-disk size: that needs a store.info() HEAD, | ||
| # which the stdio rest backend mishandles (it blocks reading a body the HEAD response never | ||
| # carries), and the caller already guarantees the two sets are the pack's complete object set. | ||
|
Comment on lines
+859
to
+861
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That sounds like it should be in a issue on the borgstore repository. |
||
| kept = [] # (obj_offset, obj_id, obj_size), offset-ordered | ||
| covered = 0 | ||
| for offset, obj_id, size, keep in located: | ||
| assert offset == covered, f"gap or overlap in pack {bin_to_hex(pack_id)} at offset {covered}" | ||
| covered += size | ||
| if keep: | ||
| kept.append((offset, obj_id, size)) | ||
| assert covered == self.store.info(pack_key).size, f"pack {bin_to_hex(pack_id)} not fully covered" | ||
|
|
||
| for drop_id in drop_ids: # remove dropped objects from the index; their bytes are not copied forward | ||
| del self.chunks[drop_id] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,7 +20,6 @@ | |
| from ...constants import * # NOQA | ||
| from ...helpers import Location, umount | ||
| from ...helpers import EXIT_SUCCESS | ||
| from ...helpers import bin_to_hex | ||
| from ...helpers import init_ec_warnings | ||
| from ...logger import flush_logging | ||
| from ...manifest import Manifest | ||
|
|
@@ -181,17 +180,8 @@ def open_archive(repo_path, name): | |
|
|
||
|
|
||
| def delete_chunk(repository, id): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Guess you just implemented It is not very fast, but considering it is rarely ever used, this is not a problem. |
||
| """Drop the pack holding chunk `id` (test damage helper). | ||
|
|
||
| Repository.delete is a no-op now, so tests that need a chunk to really vanish drop its whole | ||
| pack at the store level. Works at N=1 (one chunk per pack). The pack is resolved through the | ||
| chunk index, since the pack file name is the pack_id, which need not equal the chunk_id. | ||
| """ | ||
| entry = repository.chunks.get(id) | ||
| if entry is not None: | ||
| repository.store_delete("packs/" + bin_to_hex(entry.pack_id)) | ||
| else: | ||
| raise Repository.ObjectNotFound(id, repository) | ||
| """Remove a single chunk from the repo, leaving the rest of its pack intact (test damage helper).""" | ||
| repository.delete(id) | ||
|
Comment on lines
182
to
+184
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a special reason why you kept this wrapper? |
||
|
|
||
|
|
||
| def open_repository(archiver): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a circular import issue if you import this on module level?