|
| 1 | +package backup |
| 2 | + |
| 3 | +import ( |
| 4 | + "io" |
| 5 | + "os" |
| 6 | + "path/filepath" |
| 7 | + "strings" |
| 8 | + |
| 9 | + "github.com/cockroachdb/errors" |
| 10 | +) |
| 11 | + |
| 12 | +// encode_s3_collision.go is the M4-2b helper that inverts the S3 |
| 13 | +// decoder's collision-rename writes. The decoder writes a per-bucket |
| 14 | +// KEYMAP.jsonl with one record per renamed object (KindS3LeafData, |
| 15 | +// KindMetaCollision, KindSHAFallback); this file is the read-side that |
| 16 | +// turns that file back into a map[encoded-rel-path]KeymapRecord the |
| 17 | +// object walk consults to recover original S3 keys. |
| 18 | +// |
| 19 | +// Design: docs/design/2026_06_03_proposed_s3_collision_reversal.md. |
| 20 | +// |
| 21 | +// File-system safety: loadBucketKeymap re-runs the read-side |
| 22 | +// Lstat → refuseHardLink → Open sequence rather than reusing |
| 23 | +// OpenSidecarFile. OpenSidecarFile is write-side |
| 24 | +// (O_WRONLY|O_CREATE|O_TRUNC on Windows/fallback, O_WRONLY|O_CREATE |
| 25 | +// + Truncate(0) on Unix) and would erase KEYMAP.jsonl before the |
| 26 | +// reader could see it. Codex P2 v913 v3. |
| 27 | +// |
| 28 | +// Duplicate-key contract: this loader DIVERGES from LoadKeymap |
| 29 | +// (keymap.go:245) which is last-wins. The S3 decoder's recordKeymap |
| 30 | +// writes exactly one entry per renamed object, so a duplicate |
| 31 | +// `Encoded` value in a tracker file means the dump is corrupt — the |
| 32 | +// loader fails closed. Claude v913 v2 caught this contract mismatch. |
| 33 | + |
| 34 | +// keymapReservedRoots is the closed set of first-path-component |
| 35 | +// names disallowed as the head of a KEYMAP `Original` key. Only |
| 36 | +// _orphans/ and _incomplete_uploads/ are dump-control reserved |
| 37 | +// subtrees; legitimate user object keys like "_foo" or "_foo/bar" |
| 38 | +// remain valid. Codex P2 v913 v1. |
| 39 | +var keymapReservedRoots = map[string]struct{}{ |
| 40 | + "_orphans": {}, |
| 41 | + "_incomplete_uploads": {}, |
| 42 | +} |
| 43 | + |
| 44 | +// keymapAllowedKinds is the closed set of Kind values M4-2b honors. |
| 45 | +// A Kind outside this set fails closed so a hand-edited dump that |
| 46 | +// injects a novel discriminator cannot silently bypass invariants |
| 47 | +// (forward-compat guard, claude v913 v2). |
| 48 | +var keymapAllowedKinds = map[string]struct{}{ |
| 49 | + KindS3LeafData: {}, |
| 50 | + KindMetaCollision: {}, |
| 51 | + KindSHAFallback: {}, |
| 52 | +} |
| 53 | + |
| 54 | +// loadBucketKeymap reads <bucketDir>/KEYMAP.jsonl into a |
| 55 | +// map[Encoded]KeymapRecord. Precondition: the caller has already |
| 56 | +// verified via isKeymapCollisionTracker that the file exists, is a |
| 57 | +// regular file, and has no companion .elastickv-meta.json sidecar |
| 58 | +// (sidecar-paired KEYMAP.jsonl is a legitimate user object handled |
| 59 | +// by the normal object walk; codex P2 v913 v2). |
| 60 | +// |
| 61 | +// All four invariants in the design doc's "Error contract" section |
| 62 | +// fire from here — duplicate Encoded, malformed JSON, suffix- |
| 63 | +// mismatch per Kind, reserved-root, and unknown Kind. Orphan-record |
| 64 | +// detection (Encoded with no on-disk body) happens in |
| 65 | +// verifyKeymapTargetsExist after the load, because it requires the |
| 66 | +// full keymap to be parsed first. |
| 67 | +func (e *S3RecordEncoder) loadBucketKeymap(root *os.Root, bucketDir string) (map[string]KeymapRecord, error) { |
| 68 | + keymapRel := filepath.Join(bucketDir, "KEYMAP.jsonl") |
| 69 | + // Read-side Lstat + refuseHardLink + Open. NOT OpenSidecarFile |
| 70 | + // (write-side; truncates before read). Mirrors openRootRegular |
| 71 | + // (encode_s3_objects.go:260) but tolerates os.ErrNotExist as a |
| 72 | + // no-op return (defensive — caller's precondition already |
| 73 | + // covered the Lstat once, but interleaved with our re-Lstat the |
| 74 | + // file could vanish; treat that as "no records" rather than |
| 75 | + // failing closed). |
| 76 | + linfo, err := root.Lstat(keymapRel) |
| 77 | + if errors.Is(err, os.ErrNotExist) { |
| 78 | + return nil, nil |
| 79 | + } |
| 80 | + if err != nil { |
| 81 | + return nil, errors.WithStack(err) |
| 82 | + } |
| 83 | + if !linfo.Mode().IsRegular() { |
| 84 | + return nil, errors.Wrapf(ErrS3EncodeNotRegular, "%s (mode=%s)", keymapRel, linfo.Mode()) |
| 85 | + } |
| 86 | + if err := refuseHardLink(linfo, keymapRel); err != nil { |
| 87 | + return nil, err |
| 88 | + } |
| 89 | + f, err := root.Open(keymapRel) |
| 90 | + if err != nil { |
| 91 | + return nil, errors.WithStack(err) |
| 92 | + } |
| 93 | + defer func() { _ = f.Close() }() |
| 94 | + |
| 95 | + return readKeymapStrict(f, keymapRel) |
| 96 | +} |
| 97 | + |
| 98 | +// readKeymapStrict iterates a KEYMAP.jsonl stream and assembles the |
| 99 | +// map, failing closed on duplicate Encoded values (vs LoadKeymap's |
| 100 | +// last-wins) and on every per-record invariant. Split out of |
| 101 | +// loadBucketKeymap so the parent stays under the cyclop limit. |
| 102 | +func readKeymapStrict(f io.Reader, keymapRel string) (map[string]KeymapRecord, error) { |
| 103 | + out := make(map[string]KeymapRecord) |
| 104 | + rd := NewKeymapReader(f) |
| 105 | + for { |
| 106 | + rec, ok, err := rd.Next() |
| 107 | + if err != nil { |
| 108 | + return nil, err |
| 109 | + } |
| 110 | + if !ok { |
| 111 | + return out, nil |
| 112 | + } |
| 113 | + if _, dup := out[rec.Encoded]; dup { |
| 114 | + return nil, errors.Wrapf(ErrInvalidKeymapRecord, |
| 115 | + "%s: duplicate encoded segment %q (S3 decoder writes one record per rename; a duplicate means the dump is corrupt)", |
| 116 | + keymapRel, rec.Encoded) |
| 117 | + } |
| 118 | + if err := validateKeymapRecord(rec); err != nil { |
| 119 | + return nil, errors.Wrapf(err, "%s", keymapRel) |
| 120 | + } |
| 121 | + out[rec.Encoded] = rec |
| 122 | + } |
| 123 | +} |
| 124 | + |
| 125 | +// validateKeymapRecord enforces the per-record invariants of the |
| 126 | +// design doc's "Error contract" section: Kind must be in the closed |
| 127 | +// set; KindS3LeafData entries must end in S3LeafDataSuffix; |
| 128 | +// KindMetaCollision entries' original key must end in |
| 129 | +// S3MetaSuffixReserved; no entry's original key may target a |
| 130 | +// reserved top-level dump subtree. |
| 131 | +func validateKeymapRecord(rec KeymapRecord) error { |
| 132 | + if _, ok := keymapAllowedKinds[rec.Kind]; !ok { |
| 133 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 134 | + "unknown kind %q for encoded segment %q", rec.Kind, rec.Encoded) |
| 135 | + } |
| 136 | + if err := validateKeymapEncodedPath(rec.Encoded); err != nil { |
| 137 | + return err |
| 138 | + } |
| 139 | + originalBytes, err := rec.Original() |
| 140 | + if err != nil { |
| 141 | + return err |
| 142 | + } |
| 143 | + original := string(originalBytes) |
| 144 | + // S3 object keys cannot be empty; reject an empty decoded |
| 145 | + // original here so the operator sees the keymap problem at load |
| 146 | + // time rather than at object-body assembly. Gemini medium PR #928. |
| 147 | + if original == "" { |
| 148 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 149 | + "empty original key for encoded segment %q (kind=%q)", rec.Encoded, rec.Kind) |
| 150 | + } |
| 151 | + // Mirror the decoder's safeJoinUnderRoot (s3.go:808) restrictions |
| 152 | + // on the decoded original key. A corrupt KEYMAP entry can decode |
| 153 | + // to a key shape the decoder's restore path refuses (NUL byte, |
| 154 | + // backslash, `.`/`..` segments, empty segments, leading slash). |
| 155 | + // Without this gate the encoder would emit !s3 records under the |
| 156 | + // rejected key and the resulting snapshot would fail at restore |
| 157 | + // instead of fail-closed at keymap load time. Codex P2 #928 round 3. |
| 158 | + if err := validateKeymapOriginalPath(rec, original); err != nil { |
| 159 | + return err |
| 160 | + } |
| 161 | + if err := validateKeymapReservedRoot(rec, original); err != nil { |
| 162 | + return err |
| 163 | + } |
| 164 | + return validateKeymapKindSuffix(rec, original) |
| 165 | +} |
| 166 | + |
| 167 | +// validateKeymapOriginalPath mirrors the decoder's |
| 168 | +// safeJoinUnderRoot restrictions (s3.go:808) on the decoded |
| 169 | +// original key. The decoder refuses NUL bytes (POSIX cannot |
| 170 | +// represent them), backslashes (Windows separator that bypasses |
| 171 | +// the slash-segment scan), `.` / `..` segments (S3 treats them |
| 172 | +// literally; filepath.Clean would silently merge distinct keys), |
| 173 | +// and empty segments (consecutive `/` or leading `/`). A KEYMAP |
| 174 | +// entry recovering such a key would emit !s3 records the decoder |
| 175 | +// could never re-materialise during restore. Codex P2 #928 round 3. |
| 176 | +func validateKeymapOriginalPath(rec KeymapRecord, original string) error { |
| 177 | + if strings.ContainsRune(original, 0) { |
| 178 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 179 | + "original key %q contains NUL byte (encoded=%q)", original, rec.Encoded) |
| 180 | + } |
| 181 | + if strings.ContainsRune(original, '\\') { |
| 182 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 183 | + "original key %q contains backslash (encoded=%q; rename in S3 first)", original, rec.Encoded) |
| 184 | + } |
| 185 | + for _, seg := range strings.Split(original, "/") { |
| 186 | + switch seg { |
| 187 | + case "", ".", "..": |
| 188 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 189 | + "original key %q has invalid path segment %q (encoded=%q)", original, seg, rec.Encoded) |
| 190 | + } |
| 191 | + } |
| 192 | + return nil |
| 193 | +} |
| 194 | + |
| 195 | +// validateKeymapEncodedPath rejects rec.Encoded values that would |
| 196 | +// escape the bucket sub-root, carry empty segments, contain |
| 197 | +// backslashes filepath.Join splits on Windows, or name a |
| 198 | +// walker-skipped control/sidecar file that encodeObject never |
| 199 | +// reaches. |
| 200 | +// |
| 201 | +// Path segments: gemini medium PR #928. |
| 202 | +// Backslash + walker-skipped reserved names: codex P2 #928 round 4. |
| 203 | +// - The walker (walkObjectEntry) returns nil for top-level |
| 204 | +// "_bucket.json" and top-level "KEYMAP.jsonl" (when a tracker |
| 205 | +// is loaded) and for any file ending in S3MetaSuffixReserved. |
| 206 | +// A KEYMAP entry whose Encoded names one of those passes the |
| 207 | +// IsRegular check in verifyKeymapTargetsExist but is silently |
| 208 | +// ignored by the walk - the corruption surfaces only as a |
| 209 | +// missing-key during restore. |
| 210 | +func validateKeymapEncodedPath(encoded string) error { |
| 211 | + if strings.ContainsRune(encoded, '\\') { |
| 212 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 213 | + "encoded path %q contains backslash (treated as a separator on Windows; rename in S3 first)", encoded) |
| 214 | + } |
| 215 | + segs := strings.Split(encoded, "/") |
| 216 | + for _, seg := range segs { |
| 217 | + if seg == "" || seg == "." || seg == ".." { |
| 218 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 219 | + "encoded path %q contains invalid segment %q", encoded, seg) |
| 220 | + } |
| 221 | + } |
| 222 | + return validateKeymapEncodedPathWalkerSkipped(encoded, segs) |
| 223 | +} |
| 224 | + |
| 225 | +// validateKeymapEncodedPathWalkerSkipped rejects encoded paths the |
| 226 | +// object walk would silently skip — sidecar suffixes at any depth, |
| 227 | +// top-level "_bucket.json" / "KEYMAP.jsonl" control files, and any |
| 228 | +// path rooted in a reserved dump subtree (_orphans / |
| 229 | +// _incomplete_uploads). Split out of validateKeymapEncodedPath so |
| 230 | +// the parent stays under cyclop. Codex P2 #928 round 4 + round 5. |
| 231 | +func validateKeymapEncodedPathWalkerSkipped(encoded string, segs []string) error { |
| 232 | + if strings.HasSuffix(segs[len(segs)-1], S3MetaSuffixReserved) { |
| 233 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 234 | + "encoded path %q ends in reserved sidecar suffix %q (walker would skip)", |
| 235 | + encoded, S3MetaSuffixReserved) |
| 236 | + } |
| 237 | + // _bucket.json and KEYMAP.jsonl are walker-skipped only at the |
| 238 | + // top level (rel == ""). Nested user keys like |
| 239 | + // "prefix/_bucket.json" are real object bodies that the walk |
| 240 | + // processes normally, so accept those. |
| 241 | + if len(segs) == 1 && (segs[0] == "_bucket.json" || segs[0] == "KEYMAP.jsonl") { |
| 242 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 243 | + "encoded path %q names a top-level control file the walker skips", encoded) |
| 244 | + } |
| 245 | + // Reserved-subtree first-segment guard. walkObjectSubdir skips |
| 246 | + // top-level "_orphans/" and "_incomplete_uploads/" when they |
| 247 | + // hold no sidecars (the dump's own payload). A KEYMAP entry |
| 248 | + // whose Encoded is under those subtrees passes the IsRegular |
| 249 | + // check on the existing dump-control payload (e.g. |
| 250 | + // "_orphans/foo.bin" or "_incomplete_uploads/records.jsonl") but |
| 251 | + // the walk never consumes it. The companion gate for the |
| 252 | + // decoded original key lives in validateKeymapReservedRoot; |
| 253 | + // this mirror catches the encoded side. Codex P2 #928 round 5. |
| 254 | + if _, reserved := keymapReservedRoots[segs[0]]; reserved { |
| 255 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 256 | + "encoded path %q targets reserved dump subtree %q (walker would skip)", encoded, segs[0]) |
| 257 | + } |
| 258 | + return nil |
| 259 | +} |
| 260 | + |
| 261 | +// validateKeymapKindSuffix enforces the Kind-specific suffix |
| 262 | +// invariants: KindS3LeafData encoded values must end in |
| 263 | +// S3LeafDataSuffix; KindMetaCollision original values must end in |
| 264 | +// S3MetaSuffixReserved; KindSHAFallback has no extra invariant |
| 265 | +// today (forward-compat). |
| 266 | +func validateKeymapKindSuffix(rec KeymapRecord, original string) error { |
| 267 | + switch rec.Kind { |
| 268 | + case KindS3LeafData: |
| 269 | + if !strings.HasSuffix(rec.Encoded, S3LeafDataSuffix) { |
| 270 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 271 | + "KindS3LeafData encoded %q missing %q suffix", rec.Encoded, S3LeafDataSuffix) |
| 272 | + } |
| 273 | + case KindMetaCollision: |
| 274 | + if !strings.HasSuffix(original, S3MetaSuffixReserved) { |
| 275 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 276 | + "KindMetaCollision original %q missing %q suffix", original, S3MetaSuffixReserved) |
| 277 | + } |
| 278 | + case KindSHAFallback: |
| 279 | + // Design §"KindSHAFallback policy for S3": currently |
| 280 | + // reserved for the (not-yet-implemented) case where a |
| 281 | + // single S3 key segment exceeds EncodeSegment's 240-byte |
| 282 | + // filesystem ceiling. The decoder does not emit it for S3 |
| 283 | + // today, but if a future decoder does, the encoder's |
| 284 | + // full-rel-path lookup handles it transparently. No |
| 285 | + // additional invariant beyond the kind+reserved-root |
| 286 | + // checks above. |
| 287 | + } |
| 288 | + return nil |
| 289 | +} |
| 290 | + |
| 291 | +// validateKeymapReservedRoot rejects an original key whose first |
| 292 | +// slash-split path component is exactly one of the dump-control |
| 293 | +// reserved subtrees (_orphans, _incomplete_uploads). Codex P2 v913 v1: |
| 294 | +// the whole `_` namespace is NOT reserved, only those two named |
| 295 | +// subtrees. Legit user keys like "_foo" or "_foo/bar" or |
| 296 | +// "nested/_orphans/x" pass. |
| 297 | +func validateKeymapReservedRoot(rec KeymapRecord, original string) error { |
| 298 | + if original == "" { |
| 299 | + return nil |
| 300 | + } |
| 301 | + firstSegment, _, _ := strings.Cut(original, "/") |
| 302 | + if _, reserved := keymapReservedRoots[firstSegment]; reserved { |
| 303 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 304 | + "original key %q targets reserved dump subtree %q (encoded=%q, kind=%q)", |
| 305 | + original, firstSegment, rec.Encoded, rec.Kind) |
| 306 | + } |
| 307 | + return nil |
| 308 | +} |
| 309 | + |
| 310 | +// verifyKeymapTargetsExist scans the loaded keymap and confirms each |
| 311 | +// record's Encoded value points to an actual on-disk REGULAR FILE |
| 312 | +// under bucketDir. Orphan records (Encoded with no body) surface |
| 313 | +// here rather than as a missing-sidecar error from the walker, so |
| 314 | +// the operator sees the keymap inconsistency directly. O(N) Lstats |
| 315 | +// per bucket, run once at load time. |
| 316 | +// |
| 317 | +// The target MUST be a regular file. A directory at the target path |
| 318 | +// (e.g. a prefix created by another object) would pass a bare |
| 319 | +// existence check but the walk-time encodeObject path would never |
| 320 | +// run for it — encodeBucketObjects recurses into the directory and |
| 321 | +// silently ignores the keymap entry. Fail closed here. Codex P2 #928. |
| 322 | +func (e *S3RecordEncoder) verifyKeymapTargetsExist(root *os.Root, bucketDir string, km map[string]KeymapRecord) error { |
| 323 | + for encoded := range km { |
| 324 | + // Encoded is a bucket-relative path in slash-form (that's |
| 325 | + // what the decoder's recordKeymap stored; see s3.go:728 |
| 326 | + // — the call site passes resolveObjectFilename's output, |
| 327 | + // which is the slash-form S3 key + optional rename |
| 328 | + // suffix). Convert to filepath form before Lstat. |
| 329 | + rel := filepath.Join(bucketDir, filepath.FromSlash(encoded)) |
| 330 | + info, err := root.Lstat(rel) |
| 331 | + if errors.Is(err, os.ErrNotExist) { |
| 332 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 333 | + "orphan keymap record: encoded %q has no on-disk file under %s", encoded, bucketDir) |
| 334 | + } |
| 335 | + if err != nil { |
| 336 | + return errors.WithStack(err) |
| 337 | + } |
| 338 | + if !info.Mode().IsRegular() { |
| 339 | + return errors.Wrapf(ErrInvalidKeymapRecord, |
| 340 | + "keymap record %q resolves to %s (mode=%s), want regular file", encoded, rel, info.Mode()) |
| 341 | + } |
| 342 | + } |
| 343 | + return nil |
| 344 | +} |
| 345 | + |
| 346 | +// resolveObjectKeyFromRel maps an on-disk bucket-relative path to |
| 347 | +// the original S3 object key. The keymap is the loaded |
| 348 | +// per-bucket map (nil for buckets with no KEYMAP.jsonl); a hit |
| 349 | +// returns the decoded original key bytes as a string, a miss |
| 350 | +// returns the slash-form of objRel itself (the no-collision case |
| 351 | +// M4-2a already covered). |
| 352 | +// |
| 353 | +// The lookup uses the slash-form of objRel because the decoder's |
| 354 | +// recordKeymap call site (s3.go:728) stored that exact form as |
| 355 | +// the Encoded field (the encodedFilename argument is the |
| 356 | +// resolveObjectFilename return, which is plain slash-form S3 |
| 357 | +// path + optional suffix). |
| 358 | +func resolveObjectKeyFromRel(keymap map[string]KeymapRecord, objRel string) (string, error) { |
| 359 | + slashRel := filepath.ToSlash(objRel) |
| 360 | + if keymap == nil { |
| 361 | + return slashRel, nil |
| 362 | + } |
| 363 | + rec, ok := keymap[slashRel] |
| 364 | + if !ok { |
| 365 | + return slashRel, nil |
| 366 | + } |
| 367 | + originalBytes, err := rec.Original() |
| 368 | + if err != nil { |
| 369 | + return "", err |
| 370 | + } |
| 371 | + return string(originalBytes), nil |
| 372 | +} |
0 commit comments