Skip to content

Commit 9f9c70e

Browse files
committed
Merge branch 'fix/verify-branch-metadata-stability' into 'master'
fix: stabilize branch metadata verification and protect branch heads during cleanup See merge request postgres-ai/database-lab!1097
2 parents 149a107 + 98f32d9 commit 9f9c70e

6 files changed

Lines changed: 593 additions & 16 deletions

File tree

engine/internal/provision/thinclones/zfs/branching.go

Lines changed: 195 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -127,39 +127,224 @@ func (m *Manager) VerifyBranchMetadata() error {
127127
}
128128

129129
branchHeads := make(map[string]string)
130-
130+
branchRoots := make(map[string]string)
131+
parents := make(map[string]string, numberSnapshots)
132+
children := make(map[string][]string, numberSnapshots)
133+
134+
// Iterate oldest → newest to compute the chain in memory.
135+
// A snapshot can only have one parent in the ZFS chain. Fork-point snapshots
136+
// carry multiple branch tags but share the same predecessor, so the last
137+
// branch iteration produces the correct (and identical) parent value.
131138
for i := numberSnapshots; i > 0; i-- {
132139
sn := snapshots[i-1]
133140
log.Dbg(sn)
134141

135-
if err := m.DeleteBranchProp(sn.Branch, sn.ID); err != nil {
136-
return fmt.Errorf("failed to clean branch property: %w", err)
142+
for _, br := range splitBranches(sn.Branch) {
143+
head, ok := branchHeads[br]
144+
if !ok {
145+
branchHeads[br] = sn.ID
146+
branchRoots[br] = sn.ID
147+
148+
continue
149+
}
150+
151+
parents[sn.ID] = head
152+
children[head] = appendUnique(children[head], sn.ID)
153+
branchHeads[br] = sn.ID
137154
}
155+
}
138156

139-
head, ok := branchHeads[sn.Branch]
140-
if !ok {
141-
branchHeads[sn.Branch] = sn.ID
142-
continue
157+
// Restore cross-branch parent/child links using dle:root properties.
158+
rootProps, err := m.readRootProperties()
159+
if err != nil {
160+
log.Warn(fmt.Sprintf("failed to read root properties, skipping cross-branch link restoration: %v", err))
161+
}
162+
163+
for forkSnap, branches := range rootProps {
164+
for _, br := range branches {
165+
oldest, ok := branchRoots[br]
166+
if !ok {
167+
continue
168+
}
169+
170+
if parents[oldest] != "" {
171+
continue
172+
}
173+
174+
parents[oldest] = forkSnap
175+
children[forkSnap] = append(children[forkSnap], oldest)
143176
}
177+
}
144178

145-
if err := m.SetRelation(head, sn.ID); err != nil {
146-
return fmt.Errorf("failed to set snapshot relations: %w", err)
179+
// Read existing parent/child properties in bulk to avoid rewriting unchanged values.
180+
existing, err := m.readParentChildProperties()
181+
if err != nil {
182+
log.Warn(fmt.Sprintf("failed to read existing properties, will write all: %v", err))
183+
184+
existing = make(map[string]parentChild)
185+
}
186+
187+
// Write only changed parent/child properties.
188+
for _, sn := range snapshots {
189+
parentVal := parents[sn.ID]
190+
if parentVal == "" {
191+
parentVal = empty
192+
}
193+
194+
childVal := empty
195+
if c, ok := children[sn.ID]; ok {
196+
childVal = strings.Join(c, branchSep)
147197
}
148198

149-
branchHeads[sn.Branch] = sn.ID
199+
cur := existing[sn.ID]
200+
201+
if cur.parent != parentVal {
202+
if err := m.setProperty(parentProp, parentVal, sn.ID); err != nil {
203+
return fmt.Errorf("failed to set parent property for %s: %w", sn.ID, err)
204+
}
205+
}
206+
207+
if cur.child != childVal {
208+
if err := m.setProperty(childProp, childVal, sn.ID); err != nil {
209+
return fmt.Errorf("failed to set child property for %s: %w", sn.ID, err)
210+
}
211+
}
150212
}
151213

214+
// Assign branch tags to head snapshots before removing stale ones,
215+
// so the tag is never absent from all snapshots at once.
152216
for brName, latestID := range branchHeads {
153217
if err := m.AddBranchProp(brName, latestID); err != nil {
154218
return fmt.Errorf("failed to add branch property: %w", err)
155219
}
156220
}
157221

222+
// Remove stale branch tags. A snapshot may be head of one branch but carry
223+
// a stale tag for another, so check each tag individually.
224+
for _, sn := range snapshots {
225+
for _, br := range splitBranches(sn.Branch) {
226+
if branchHeads[br] == sn.ID {
227+
continue
228+
}
229+
230+
if err := m.DeleteBranchProp(br, sn.ID); err != nil {
231+
log.Warn(fmt.Sprintf("failed to clean branch property for %s: %v", sn.ID, err))
232+
}
233+
}
234+
}
235+
158236
log.Msg("data branching has been verified")
159237

160238
return nil
161239
}
162240

241+
type parentChild struct {
242+
parent string
243+
child string
244+
}
245+
246+
// readParentChildProperties reads dle:parent and dle:child for all snapshots in one zfs call.
247+
func (m *Manager) readParentChildProperties() (map[string]parentChild, error) {
248+
cmd := fmt.Sprintf("zfs list -H -t snapshot -o name,%s,%s -r %s", parentProp, childProp, m.config.Pool.Name)
249+
250+
out, err := m.runner.Run(cmd)
251+
if err != nil {
252+
return nil, fmt.Errorf("failed to read parent/child properties: %w", err)
253+
}
254+
255+
result := make(map[string]parentChild)
256+
257+
const expectedColumns = 3
258+
259+
for _, line := range strings.Split(strings.TrimSpace(out), "\n") {
260+
fields := strings.SplitN(line, "\t", expectedColumns)
261+
if len(fields) != expectedColumns {
262+
continue
263+
}
264+
265+
result[fields[0]] = parentChild{
266+
parent: fields[1],
267+
child: fields[2],
268+
}
269+
}
270+
271+
return result, nil
272+
}
273+
274+
// readRootProperties reads dle:root for all snapshots in the pool.
275+
func (m *Manager) readRootProperties() (map[string][]string, error) {
276+
cmd := fmt.Sprintf("zfs list -H -t snapshot -o name,%s -r %s", rootProp, m.config.Pool.Name)
277+
278+
out, err := m.runner.Run(cmd)
279+
if err != nil {
280+
return nil, fmt.Errorf("failed to read root properties: %w", err)
281+
}
282+
283+
roots := make(map[string][]string)
284+
285+
const expectedColumns = 2
286+
287+
for _, line := range strings.Split(strings.TrimSpace(out), "\n") {
288+
fields := strings.SplitN(line, "\t", expectedColumns)
289+
if len(fields) != expectedColumns {
290+
continue
291+
}
292+
293+
rootVal := fields[1]
294+
if rootVal == "" || rootVal == empty {
295+
continue
296+
}
297+
298+
for _, br := range strings.Split(rootVal, branchSep) {
299+
br = strings.TrimSpace(br)
300+
if br != "" && br != empty {
301+
roots[fields[0]] = append(roots[fields[0]], br)
302+
}
303+
}
304+
}
305+
306+
return roots, nil
307+
}
308+
309+
func appendUnique(slice []string, val string) []string {
310+
for _, s := range slice {
311+
if s == val {
312+
return slice
313+
}
314+
}
315+
316+
return append(slice, val)
317+
}
318+
319+
// splitBranches parses a comma-separated branch property value into individual branch names.
320+
// Snapshots with no branch tag (empty or "-") default to the main branch to maintain
321+
// consistency with InitBranching, which assigns untagged snapshots to the default branch.
322+
func splitBranches(branch string) []string {
323+
if branch == "" || branch == empty {
324+
return []string{branching.DefaultBranch}
325+
}
326+
327+
if !strings.Contains(branch, branchSep) {
328+
return []string{branch}
329+
}
330+
331+
branches := make([]string, 0)
332+
333+
for _, b := range strings.Split(branch, branchSep) {
334+
b = strings.TrimSpace(b)
335+
336+
if b != "" && b != empty {
337+
branches = append(branches, b)
338+
}
339+
}
340+
341+
if len(branches) == 0 {
342+
return []string{branching.DefaultBranch}
343+
}
344+
345+
return branches
346+
}
347+
163348
// CreateBranch clones data as a new branch.
164349
func (m *Manager) CreateBranch(branchName, snapshotID string) error {
165350
// zfs clone -p pool@snapshot_20221019094237 pool/branch/001-branch

engine/internal/provision/thinclones/zfs/zfs.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,13 @@ func (m *Manager) CleanupSnapshots(retentionLimit int, mode models.RetrievalMode
588588

589589
busySnapshots := m.getBusySnapshotList(clonesOutput)
590590

591+
branchHeads, err := m.getBranchHeadSnapshots()
592+
if err != nil {
593+
return nil, fmt.Errorf("failed to determine protected snapshots: %w", err)
594+
}
595+
596+
busySnapshots = append(busySnapshots, branchHeads...)
597+
591598
modeFilter := ""
592599

593600
if mode == models.Physical {
@@ -770,6 +777,40 @@ func (m *Manager) getBusySnapshotList(clonesOutput string) []string {
770777
return busySnapshots
771778
}
772779

780+
// getBranchHeadSnapshots returns branch head snapshots and their upstream origin snapshots.
781+
// This prevents cleanup from destroying snapshots that serve as branch heads or
782+
// pre-snapshots whose clone chains contain branch heads.
783+
func (m *Manager) getBranchHeadSnapshots() ([]string, error) {
784+
branches, err := m.listBranches()
785+
if err != nil {
786+
return nil, fmt.Errorf("failed to list branches for cleanup protection: %w", err)
787+
}
788+
789+
protected := make([]string, 0, len(branches))
790+
791+
for _, snapshotID := range branches {
792+
protected = append(protected, snapshotID)
793+
794+
dataset, _, found := strings.Cut(snapshotID, "@")
795+
if !found {
796+
continue
797+
}
798+
799+
origin, err := m.runner.Run("zfs get -H -o value origin " + dataset)
800+
if err != nil {
801+
log.Dbg("failed to get origin for dataset", dataset, ":", err)
802+
continue
803+
}
804+
805+
origin = strings.TrimSpace(origin)
806+
if origin != "" && origin != "-" {
807+
protected = append(protected, origin)
808+
}
809+
}
810+
811+
return protected, nil
812+
}
813+
773814
// excludeBusySnapshots excludes snapshots that match a pattern by name.
774815
// The exclusion logic relies on the fact that snapshots have unique substrings (timestamps).
775816
func excludeBusySnapshots(busySnapshots []string) string {

0 commit comments

Comments
 (0)