|
| 1 | +// Package backup implements the per-adapter logical-backup format defined in |
| 2 | +// docs/design/2026_04_29_proposed_snapshot_logical_decoder.md (Phase 0) and |
| 3 | +// reused by docs/design/2026_04_29_proposed_logical_backup.md (Phase 1). |
| 4 | +// |
| 5 | +// This file owns the filename encoding rules for non-S3 segments. S3 object |
| 6 | +// keys preserve their `/` separators (and so are not transformed by EncodeSegment); |
| 7 | +// every other adapter scope encodes user-supplied bytes through this path. |
| 8 | +// |
| 9 | +// Encoding rules (see "Filename encoding" in the Phase 0 doc): |
| 10 | +// |
| 11 | +// - Bytes in the unreserved set [A-Za-z0-9._-] pass through. |
| 12 | +// - Every other byte is rendered as %HH (uppercase hex), like |
| 13 | +// application/x-www-form-urlencoded but applied to every non-allowlisted byte. |
| 14 | +// - If the encoded result exceeds maxSegmentBytes (240), the segment is |
| 15 | +// replaced with <sha256-hex-prefix-32>__<truncated-original> and the full |
| 16 | +// original bytes must be recorded in KEYMAP.jsonl by the caller. |
| 17 | +// - Binary DynamoDB partition / sort keys take a separate "b64.<base64url>" |
| 18 | +// path so a binary key never collides with a string key whose |
| 19 | +// percent-encoding happens to look like base64. EncodeBinarySegment emits the "b64." form. |
| 20 | +package backup |
| 21 | + |
| 22 | +import ( |
| 23 | + "crypto/sha256" |
| 24 | + "encoding/base64" |
| 25 | + "encoding/hex" |
| 26 | + "strings" |
| 27 | + |
| 28 | + "github.com/cockroachdb/errors" |
| 29 | +) |
| 30 | + |
const (
	// maxSegmentBytes caps the length of any encoded path segment this
	// package emits, leaving headroom under the common NAME_MAX of 255.
	// A "%HH" escape is three bytes wide, so a raw segment of this size
	// could percent-encode to three times as many bytes; whenever the
	// encoded form would exceed this cap the encoder emits the (shorter,
	// bounded) SHA-fallback rendering instead.
	maxSegmentBytes = 240

	// shaFallbackHexPrefixBytes is the count of SHA-256 hex characters
	// (not raw digest bytes) that open a SHA-fallback segment. 32 hex
	// characters carry 128 bits of the digest, which makes an accidental
	// collision within a single scope negligible.
	shaFallbackHexPrefixBytes = 32

	// shaFallbackSeparator sits between the hash prefix and the truncated
	// original. It is doubled because a single underscore is common in
	// user keys, whereas "__" is much rarer, which keeps the boundary
	// unambiguous.
	shaFallbackSeparator = "__"

	// shaFallbackTruncatedSuffixBytes is how many leading raw bytes are
	// kept (and then percent-encoded) after the separator in a
	// SHA-fallback segment. The longest possible fallback segment is
	//
	//	shaFallbackHexPrefixBytes + len(shaFallbackSeparator) + 3*shaFallbackTruncatedSuffixBytes
	//	= 32 + 2 + 3*64 = 226 bytes,
	//
	// which stays under maxSegmentBytes. The retained bytes only help a
	// human recognise the key; they are not sufficient to reconstruct it.
	// Exact recovery relies on KEYMAP.jsonl.
	shaFallbackTruncatedSuffixBytes = 64

	// binaryPrefix tags a segment that holds a DynamoDB B-attribute value
	// rendered as base64url.
	binaryPrefix = "b64."
)
| 68 | + |
// Sentinel errors reported by DecodeSegment.
var (
	// ErrInvalidEncodedSegment means the input could not be decoded: it is
	// not a well-formed percent-encoded segment, not a decodable
	// binary-prefixed segment, and not a SHA-fallback segment.
	ErrInvalidEncodedSegment = errors.New("backup: invalid encoded filename segment")

	// ErrShaFallbackNeedsKeymap means the input is a SHA-fallback segment.
	// Its original bytes cannot be rebuilt from the filename alone, so the
	// caller has to look them up in KEYMAP.jsonl.
	ErrShaFallbackNeedsKeymap = errors.New("backup: filename uses SHA fallback; consult KEYMAP.jsonl")
)
| 78 | + |
| 79 | +// EncodeSegment encodes a single user-supplied path segment for use as a |
| 80 | +// filename component. It is the inverse of DecodeSegment for non-fallback |
| 81 | +// inputs. |
| 82 | +// |
| 83 | +// The encoding is deterministic given the same input. |
| 84 | +// |
| 85 | +// Three structural short-circuits ensure DecodeSegment cannot |
| 86 | +// misclassify a legitimate key: |
| 87 | +// |
| 88 | +// - If `raw` is longer than maxSegmentBytes, even a fully-unreserved |
| 89 | +// encoding (1:1) cannot fit, so we go straight to shaFallback. |
| 90 | +// This also caps the percent-encode allocation at |
| 91 | +// ~maxSegmentBytes, preventing OOM on adversarial input. |
| 92 | +// - If the percent-encoded form happens to match the SHA-fallback |
| 93 | +// shape (32 hex chars followed by "__"), we promote it to a real |
| 94 | +// SHA-fallback so DecodeSegment's structural detection cannot |
| 95 | +// fabricate a wrong original. |
| 96 | +// - If the percent-encoded form starts with the binary "b64." |
| 97 | +// prefix, we promote to SHA-fallback for the same reason: a |
| 98 | +// plain string key like "b64.foo" would otherwise be decoded as |
| 99 | +// base64 and produce different bytes on round-trip. |
| 100 | +// |
| 101 | +// Both promoted-fallback paths leave the original in KEYMAP.jsonl |
| 102 | +// (a correctness dependency, per the package doc), so exact-byte |
| 103 | +// recovery is preserved. |
| 104 | +func EncodeSegment(raw []byte) string { |
| 105 | + if len(raw) > maxSegmentBytes { |
| 106 | + // 1:1 lower bound on encoded length; cannot fit. |
| 107 | + return shaFallback(raw) |
| 108 | + } |
| 109 | + encoded, ok := percentEncodeBounded(raw, maxSegmentBytes) |
| 110 | + if !ok || isShaFallback(encoded) || strings.HasPrefix(encoded, binaryPrefix) { |
| 111 | + return shaFallback(raw) |
| 112 | + } |
| 113 | + return encoded |
| 114 | +} |
| 115 | + |
| 116 | +// percentEncodeBounded percent-encodes raw, bailing out as soon as the |
| 117 | +// in-progress output would exceed maxLen. Returns ("", false) on |
| 118 | +// overflow so the caller can take the SHA-fallback path without |
| 119 | +// having allocated the full 3*len(raw) buffer that the unbounded |
| 120 | +// variant would. Returns (encoded, true) on success. |
| 121 | +func percentEncodeBounded(raw []byte, maxLen int) (string, bool) { |
| 122 | + const escapeBytes = 3 // len("%HH") -- one escape's worst-case width |
| 123 | + cap := escapeBytes * len(raw) |
| 124 | + if cap > maxLen+escapeBytes { |
| 125 | + cap = maxLen + escapeBytes |
| 126 | + } |
| 127 | + var b strings.Builder |
| 128 | + b.Grow(cap) |
| 129 | + for _, c := range raw { |
| 130 | + if isUnreserved(c) { |
| 131 | + if b.Len()+1 > maxLen { |
| 132 | + return "", false |
| 133 | + } |
| 134 | + b.WriteByte(c) |
| 135 | + continue |
| 136 | + } |
| 137 | + if b.Len()+escapeBytes > maxLen { |
| 138 | + return "", false |
| 139 | + } |
| 140 | + b.WriteByte('%') |
| 141 | + b.WriteByte(hexUpper(c >> 4)) //nolint:mnd // 4 == nibble width |
| 142 | + b.WriteByte(hexUpper(c & 0x0F)) //nolint:mnd // 0x0F == low-nibble mask |
| 143 | + } |
| 144 | + return b.String(), true |
| 145 | +} |
| 146 | + |
| 147 | +// EncodeBinarySegment encodes a DynamoDB B-attribute (binary) segment as |
| 148 | +// "b64.<base64url-no-padding>" so that binary keys never collide with string |
| 149 | +// keys whose hex-encoding happens to look like base64. |
| 150 | +// |
| 151 | +// Short-circuits the SHA-fallback for inputs whose base64 expansion (~4/3 of |
| 152 | +// the raw length, plus the 4-byte "b64." prefix) would always overflow |
| 153 | +// maxSegmentBytes. As with EncodeSegment, this avoids an unnecessary large |
| 154 | +// allocation when the result would have been discarded anyway. |
| 155 | +func EncodeBinarySegment(raw []byte) string { |
| 156 | + if base64.RawURLEncoding.EncodedLen(len(raw))+len(binaryPrefix) > maxSegmentBytes { |
| 157 | + return shaFallback(raw) |
| 158 | + } |
| 159 | + enc := binaryPrefix + base64.RawURLEncoding.EncodeToString(raw) |
| 160 | + if len(enc) > maxSegmentBytes { |
| 161 | + return shaFallback(raw) |
| 162 | + } |
| 163 | + return enc |
| 164 | +} |
| 165 | + |
| 166 | +// DecodeSegment is the inverse of EncodeSegment for percent-encoded and |
| 167 | +// binary-prefixed inputs. SHA-fallback inputs return ErrShaFallbackNeedsKeymap |
| 168 | +// so the caller knows to consult KEYMAP.jsonl rather than treat the partial |
| 169 | +// suffix as the original key. |
| 170 | +// |
| 171 | +// As a defensive measure DecodeSegment refuses inputs longer than |
| 172 | +// maxSegmentBytes. EncodeSegment never produces such inputs, so any caller |
| 173 | +// passing one is either reading a corrupted dump or has a bug; either way the |
| 174 | +// percentDecode allocation should not run. |
| 175 | +func DecodeSegment(seg string) ([]byte, error) { |
| 176 | + if len(seg) > maxSegmentBytes { |
| 177 | + return nil, errors.Wrapf(ErrInvalidEncodedSegment, |
| 178 | + "segment length %d exceeds maximum %d", len(seg), maxSegmentBytes) |
| 179 | + } |
| 180 | + if isShaFallback(seg) { |
| 181 | + return nil, errors.WithStack(ErrShaFallbackNeedsKeymap) |
| 182 | + } |
| 183 | + if strings.HasPrefix(seg, binaryPrefix) { |
| 184 | + raw, err := base64.RawURLEncoding.DecodeString(seg[len(binaryPrefix):]) |
| 185 | + if err != nil { |
| 186 | + return nil, errors.Wrap(ErrInvalidEncodedSegment, err.Error()) |
| 187 | + } |
| 188 | + return raw, nil |
| 189 | + } |
| 190 | + return percentDecode(seg) |
| 191 | +} |
| 192 | + |
| 193 | +// IsShaFallback reports whether seg uses the SHA-prefix-and-truncated-original |
| 194 | +// form. Such segments cannot be reversed without KEYMAP.jsonl. |
| 195 | +func IsShaFallback(seg string) bool { |
| 196 | + return isShaFallback(seg) |
| 197 | +} |
| 198 | + |
| 199 | +// IsBinarySegment reports whether seg is a base64-url encoded binary segment |
| 200 | +// emitted by EncodeBinarySegment. |
| 201 | +func IsBinarySegment(seg string) bool { |
| 202 | + return strings.HasPrefix(seg, binaryPrefix) |
| 203 | +} |
| 204 | + |
| 205 | +func percentEncode(raw []byte) string { |
| 206 | + // Worst case: every byte expands to %HH (3 bytes). Pre-allocate. |
| 207 | + var b strings.Builder |
| 208 | + b.Grow(len(raw) * 3) //nolint:mnd // 3 == len("%HH"), local idiom |
| 209 | + for _, c := range raw { |
| 210 | + if isUnreserved(c) { |
| 211 | + b.WriteByte(c) |
| 212 | + continue |
| 213 | + } |
| 214 | + b.WriteByte('%') |
| 215 | + b.WriteByte(hexUpper(c >> 4)) //nolint:mnd // 4 == nibble width |
| 216 | + b.WriteByte(hexUpper(c & 0x0F)) //nolint:mnd // 0x0F == low-nibble mask |
| 217 | + } |
| 218 | + return b.String() |
| 219 | +} |
| 220 | + |
| 221 | +func percentDecode(seg string) ([]byte, error) { |
| 222 | + out := make([]byte, 0, len(seg)) |
| 223 | + for i := 0; i < len(seg); i++ { |
| 224 | + c := seg[i] |
| 225 | + if c != '%' { |
| 226 | + if !isUnreserved(c) { |
| 227 | + return nil, errors.Wrapf(ErrInvalidEncodedSegment, |
| 228 | + "unexpected raw byte 0x%02x at offset %d", c, i) |
| 229 | + } |
| 230 | + out = append(out, c) |
| 231 | + continue |
| 232 | + } |
| 233 | + if i+2 >= len(seg) { //nolint:mnd // 2 == hex digit count after % |
| 234 | + return nil, errors.Wrapf(ErrInvalidEncodedSegment, |
| 235 | + "truncated percent escape at offset %d", i) |
| 236 | + } |
| 237 | + const ( |
| 238 | + hiNibbleOff = 1 |
| 239 | + loNibbleOff = 2 |
| 240 | + ) |
| 241 | + hi, ok := unhex(seg[i+hiNibbleOff]) |
| 242 | + if !ok { |
| 243 | + return nil, errors.Wrapf(ErrInvalidEncodedSegment, |
| 244 | + "non-hex digit 0x%02x at offset %d", seg[i+hiNibbleOff], i+hiNibbleOff) |
| 245 | + } |
| 246 | + lo, ok := unhex(seg[i+loNibbleOff]) |
| 247 | + if !ok { |
| 248 | + return nil, errors.Wrapf(ErrInvalidEncodedSegment, |
| 249 | + "non-hex digit 0x%02x at offset %d", seg[i+loNibbleOff], i+loNibbleOff) |
| 250 | + } |
| 251 | + out = append(out, (hi<<4)|lo) //nolint:mnd // 4 == nibble width |
| 252 | + i += loNibbleOff // skip the two consumed hex digits |
| 253 | + } |
| 254 | + return out, nil |
| 255 | +} |
| 256 | + |
| 257 | +func shaFallback(raw []byte) string { |
| 258 | + sum := sha256.Sum256(raw) |
| 259 | + hashHex := hex.EncodeToString(sum[:])[:shaFallbackHexPrefixBytes] |
| 260 | + suffix := raw |
| 261 | + if len(suffix) > shaFallbackTruncatedSuffixBytes { |
| 262 | + suffix = suffix[:shaFallbackTruncatedSuffixBytes] |
| 263 | + } |
| 264 | + return hashHex + shaFallbackSeparator + percentEncode(suffix) |
| 265 | +} |
| 266 | + |
| 267 | +func isShaFallback(seg string) bool { |
| 268 | + if len(seg) < shaFallbackHexPrefixBytes+len(shaFallbackSeparator) { |
| 269 | + return false |
| 270 | + } |
| 271 | + for i := 0; i < shaFallbackHexPrefixBytes; i++ { |
| 272 | + if _, ok := unhex(seg[i]); !ok { |
| 273 | + return false |
| 274 | + } |
| 275 | + } |
| 276 | + return seg[shaFallbackHexPrefixBytes:shaFallbackHexPrefixBytes+len(shaFallbackSeparator)] == shaFallbackSeparator |
| 277 | +} |
| 278 | + |
// isUnreserved reports whether c may appear unescaped in an encoded segment.
// The allowed set is the RFC 3986 unreserved set minus "~": ASCII letters,
// ASCII digits, "-", "." and "_". The tilde is left out because it has
// caused interop problems with older shells.
func isUnreserved(c byte) bool {
	switch c {
	case '-', '.', '_':
		return true
	}
	upper := 'A' <= c && c <= 'Z'
	lower := 'a' <= c && c <= 'z'
	digit := '0' <= c && c <= '9'
	return upper || lower || digit
}
| 295 | + |
// hexUpper maps a nibble (0-15) to its uppercase hexadecimal digit,
// '0'-'9' or 'A'-'F'.
func hexUpper(nibble byte) byte {
	const decimalDigits = 10 // values below this render as '0'-'9'
	if nibble >= decimalDigits {
		return 'A' + nibble - decimalDigits
	}
	return '0' + nibble
}
| 302 | + |
// unhex converts one hexadecimal digit to its value. Digits '0'-'9' and
// letters 'a'-'f' / 'A'-'F' are accepted; for any other byte it returns
// (0, false).
func unhex(c byte) (byte, bool) {
	const (
		decimalDigits = 10   // count of '0'-'9'
		hexLetters    = 6    // count of 'a'-'f'
		caseBit       = 0x20 // setting this bit maps 'A'-'F' onto 'a'-'f'
	)
	// Byte subtraction wraps around, so anything below the range start
	// becomes a large value and fails the upper-bound test.
	if digit := c - '0'; digit < decimalDigits {
		return digit, true
	}
	if letter := (c | caseBit) - 'a'; letter < hexLetters {
		return letter + decimalDigits, true
	}
	return 0, false
}
0 commit comments