Skip to content

Commit 25f5da3

Browse files
committed
backup: filename encoding for the logical backup format (Phase 0a foundation)
Implements internal/backup/{filename.go,filename_test.go} per the Phase 0 design doc (docs/design/2026_04_29_proposed_snapshot_logical_decoder.md). Encoding rules (deterministic, reversible from the filename alone except for the SHA fallback): - RFC3986 unreserved set [A-Za-z0-9._-] passes through. - Other bytes -> %HH (uppercase hex), like form-urlencoded but applied to every non-allowlisted byte. - Segments exceeding 240 bytes after percent-encoding render as <sha256-hex-prefix-32>__<truncated-original> with the full original recorded in KEYMAP.jsonl by the caller. DecodeSegment refuses these with ErrShaFallbackNeedsKeymap so callers cannot fabricate the original bytes from the filename alone. - DynamoDB B-attribute (binary) keys take a separate "b64.<base64url>" path so a binary key never collides with a hex-shaped string key. Tests cover passthrough, percent escaping, hex-uppercase invariant, SHA-fallback firing on long inputs (raw or post-expansion), binary round-trip, malformed-input rejection, output-length bound under adversarial inputs, and rapid-driven property tests for round-trip on both percent and binary paths.
1 parent 19c713d commit 25f5da3

2 files changed

Lines changed: 571 additions & 0 deletions

File tree

internal/backup/filename.go

Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
// Package backup implements the per-adapter logical-backup format defined in
2+
// docs/design/2026_04_29_proposed_snapshot_logical_decoder.md (Phase 0) and
3+
// reused by docs/design/2026_04_29_proposed_logical_backup.md (Phase 1).
4+
//
5+
// This file owns the filename encoding rules for non-S3 segments. S3 object
6+
// keys preserve their `/` separators (and so are not transformed by EncodeSegment);
7+
// every other adapter scope encodes user-supplied bytes through this path.
8+
//
9+
// Encoding rules (see "Filename encoding" in the Phase 0 doc):
10+
//
11+
// - Bytes in the unreserved set [A-Za-z0-9._-] pass through.
12+
// - Every other byte is rendered as %HH (uppercase hex), like
13+
// application/x-www-form-urlencoded but applied to every non-allowlisted byte.
14+
// - If the encoded result exceeds maxSegmentBytes (240), the segment is
15+
// replaced with <sha256-hex-prefix-32>__<truncated-original> and the full
16+
// original bytes must be recorded in KEYMAP.jsonl by the caller.
17+
// - Binary DynamoDB partition / sort keys take a separate "b64.<base64url>"
18+
// path so a binary key never collides with a string key whose percent-encoding
19+
// happens to look like base64. EncodeBinarySegment emits that form.
20+
package backup
21+
22+
import (
23+
"crypto/sha256"
24+
"encoding/base64"
25+
"encoding/hex"
26+
"strings"
27+
28+
"github.com/cockroachdb/errors"
29+
)
30+
31+
const (
	// binaryPrefix tags a segment whose remainder is the unpadded base64url
	// rendering of a DynamoDB B-attribute (binary) key.
	binaryPrefix = "b64."

	// shaFallbackSeparator sits between the SHA-256 hex prefix and the
	// truncated original in a SHA-fallback segment. A doubled underscore is
	// used because a single one is common inside user keys, while two in a
	// row directly after 32 hex digits is far less likely.
	shaFallbackSeparator = "__"

	// maxSegmentBytes is the longest encoded segment that is emitted as-is;
	// anything longer is replaced by its SHA-fallback rendering. 240 leaves
	// headroom under the common NAME_MAX of 255.
	//
	// The limit is applied to the *encoded* length, not the raw length:
	// each escaped byte becomes the three characters %HH, so a raw segment
	// well under 240 bytes can still exceed the limit once encoded (81
	// bytes that all need escaping already encode to 243).
	maxSegmentBytes = 240

	// shaFallbackHexPrefixBytes is how many hex characters of the SHA-256
	// digest lead a SHA-fallback segment. 32 hex characters carry 128 bits
	// of the digest, which makes an accidental collision within a single
	// scope negligible.
	shaFallbackHexPrefixBytes = 32

	// shaFallbackTruncatedSuffixBytes is how many leading raw bytes are kept
	// (percent-encoded) after the separator in a SHA-fallback segment. The
	// longest possible SHA-fallback segment is therefore
	//
	//	shaFallbackHexPrefixBytes + len("__") + 3*shaFallbackTruncatedSuffixBytes
	//	= 32 + 2 + 3*64 = 226 bytes
	//
	// which stays below maxSegmentBytes. The kept bytes only help a human
	// recognise the key; they are not enough to rebuild the original, which
	// is what KEYMAP.jsonl is for.
	shaFallbackTruncatedSuffixBytes = 64
)
68+
69+
// Sentinel errors reported by DecodeSegment. DecodeSegment wraps them before
// returning, so match with errors.Is rather than ==.
var (
	// ErrInvalidEncodedSegment is returned by DecodeSegment when its input
	// is neither a valid percent-encoded segment, a binary-prefixed segment,
	// nor a SHA-fallback segment.
	ErrInvalidEncodedSegment = errors.New("backup: invalid encoded filename segment")

	// ErrShaFallbackNeedsKeymap is returned by DecodeSegment when its input
	// is a SHA-fallback segment. Such a segment cannot be turned back into
	// its original bytes from the filename alone; the caller must consult
	// KEYMAP.jsonl.
	ErrShaFallbackNeedsKeymap = errors.New("backup: filename uses SHA fallback; consult KEYMAP.jsonl")
)
78+
79+
// EncodeSegment encodes a single user-supplied path segment for use as a
80+
// filename component. It is the inverse of DecodeSegment for non-fallback
81+
// inputs.
82+
//
83+
// The encoding is deterministic and idempotent given the same input.
84+
func EncodeSegment(raw []byte) string {
85+
encoded := percentEncode(raw)
86+
if len(encoded) <= maxSegmentBytes {
87+
return encoded
88+
}
89+
return shaFallback(raw)
90+
}
91+
92+
// EncodeBinarySegment encodes a DynamoDB B-attribute (binary) segment as
93+
// "b64.<base64url-no-padding>" so that binary keys never collide with string
94+
// keys whose hex-encoding happens to look like base64.
95+
//
96+
// b64-encoded segments take the SHA fallback if they exceed maxSegmentBytes
97+
// after the base64 expansion (~4/3 of the raw length).
98+
func EncodeBinarySegment(raw []byte) string {
99+
enc := binaryPrefix + base64.RawURLEncoding.EncodeToString(raw)
100+
if len(enc) <= maxSegmentBytes {
101+
return enc
102+
}
103+
return shaFallback(raw)
104+
}
105+
106+
// DecodeSegment is the inverse of EncodeSegment for percent-encoded and
107+
// binary-prefixed inputs. SHA-fallback inputs return ErrShaFallbackNeedsKeymap
108+
// so the caller knows to consult KEYMAP.jsonl rather than treat the partial
109+
// suffix as the original key.
110+
func DecodeSegment(seg string) ([]byte, error) {
111+
if isShaFallback(seg) {
112+
return nil, errors.WithStack(ErrShaFallbackNeedsKeymap)
113+
}
114+
if strings.HasPrefix(seg, binaryPrefix) {
115+
raw, err := base64.RawURLEncoding.DecodeString(seg[len(binaryPrefix):])
116+
if err != nil {
117+
return nil, errors.Wrap(ErrInvalidEncodedSegment, err.Error())
118+
}
119+
return raw, nil
120+
}
121+
return percentDecode(seg)
122+
}
123+
124+
// IsShaFallback reports whether seg uses the SHA-prefix-and-truncated-original
// form. Such segments cannot be reversed without KEYMAP.jsonl.
//
// It is the exported form of isShaFallback, the same check DecodeSegment
// applies before returning ErrShaFallbackNeedsKeymap. The check looks only
// at the shape of seg: a run of hex digits followed by the separator.
func IsShaFallback(seg string) bool {
	return isShaFallback(seg)
}
129+
130+
// IsBinarySegment reports whether seg is a base64-url encoded binary segment
// emitted by EncodeBinarySegment.
//
// Only the leading binaryPrefix is checked; the base64 payload after it is
// not validated here, so a true result does not guarantee that DecodeSegment
// will succeed.
func IsBinarySegment(seg string) bool {
	return strings.HasPrefix(seg, binaryPrefix)
}
135+
136+
func percentEncode(raw []byte) string {
137+
// Worst case: every byte expands to %HH (3 bytes). Pre-allocate.
138+
var b strings.Builder
139+
b.Grow(len(raw) * 3) //nolint:mnd // 3 == len("%HH"), local idiom
140+
for _, c := range raw {
141+
if isUnreserved(c) {
142+
b.WriteByte(c)
143+
continue
144+
}
145+
b.WriteByte('%')
146+
b.WriteByte(hexUpper(c >> 4)) //nolint:mnd // 4 == nibble width
147+
b.WriteByte(hexUpper(c & 0x0F)) //nolint:mnd // 0x0F == low-nibble mask
148+
}
149+
return b.String()
150+
}
151+
152+
func percentDecode(seg string) ([]byte, error) {
153+
out := make([]byte, 0, len(seg))
154+
for i := 0; i < len(seg); i++ {
155+
c := seg[i]
156+
if c != '%' {
157+
if !isUnreserved(c) {
158+
return nil, errors.Wrapf(ErrInvalidEncodedSegment,
159+
"unexpected raw byte 0x%02x at offset %d", c, i)
160+
}
161+
out = append(out, c)
162+
continue
163+
}
164+
if i+2 >= len(seg) { //nolint:mnd // 2 == hex digit count after %
165+
return nil, errors.Wrapf(ErrInvalidEncodedSegment,
166+
"truncated percent escape at offset %d", i)
167+
}
168+
const (
169+
hiNibbleOff = 1
170+
loNibbleOff = 2
171+
)
172+
hi, ok := unhex(seg[i+hiNibbleOff])
173+
if !ok {
174+
return nil, errors.Wrapf(ErrInvalidEncodedSegment,
175+
"non-hex digit 0x%02x at offset %d", seg[i+hiNibbleOff], i+hiNibbleOff)
176+
}
177+
lo, ok := unhex(seg[i+loNibbleOff])
178+
if !ok {
179+
return nil, errors.Wrapf(ErrInvalidEncodedSegment,
180+
"non-hex digit 0x%02x at offset %d", seg[i+loNibbleOff], i+loNibbleOff)
181+
}
182+
out = append(out, (hi<<4)|lo) //nolint:mnd // 4 == nibble width
183+
i += loNibbleOff // skip the two consumed hex digits
184+
}
185+
return out, nil
186+
}
187+
188+
func shaFallback(raw []byte) string {
189+
sum := sha256.Sum256(raw)
190+
hashHex := hex.EncodeToString(sum[:])[:shaFallbackHexPrefixBytes]
191+
suffix := raw
192+
if len(suffix) > shaFallbackTruncatedSuffixBytes {
193+
suffix = suffix[:shaFallbackTruncatedSuffixBytes]
194+
}
195+
return hashHex + shaFallbackSeparator + percentEncode(suffix)
196+
}
197+
198+
func isShaFallback(seg string) bool {
199+
if len(seg) < shaFallbackHexPrefixBytes+len(shaFallbackSeparator) {
200+
return false
201+
}
202+
for i := 0; i < shaFallbackHexPrefixBytes; i++ {
203+
if _, ok := unhex(seg[i]); !ok {
204+
return false
205+
}
206+
}
207+
return seg[shaFallbackHexPrefixBytes:shaFallbackHexPrefixBytes+len(shaFallbackSeparator)] == shaFallbackSeparator
208+
}
209+
210+
// isUnreserved reports whether c may appear unescaped in an encoded segment:
// ASCII letters, ASCII digits, '-', '.' and '_'. This is the RFC 3986
// unreserved set without '~', which is left out because it has caused
// interop problems with older shells.
func isUnreserved(c byte) bool {
	letter := (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	digit := c >= '0' && c <= '9'
	mark := c == '-' || c == '.' || c == '_'
	return letter || digit || mark
}
226+
227+
// hexUpper returns the uppercase ASCII hex digit for nibble: '0'-'9' for
// values below 10 and 'A'-'F' for 10 through 15. Callers pass values in the
// range 0-15.
func hexUpper(nibble byte) byte {
	const firstLetterValue = 10 // values from here on map to 'A', 'B', ...

	if nibble >= firstLetterValue {
		return nibble - firstLetterValue + 'A'
	}
	return nibble + '0'
}
233+
234+
// unhex converts one ASCII hex digit, in either case, to its value 0-15.
// The boolean is false, and the value 0, when c is not a hex digit.
func unhex(c byte) (byte, bool) {
	const firstLetterValue = 10 // value of 'a' / 'A'

	if c >= '0' && c <= '9' {
		return c - '0', true
	}
	if c >= 'a' && c <= 'f' {
		return c - 'a' + firstLetterValue, true
	}
	if c >= 'A' && c <= 'F' {
		return c - 'A' + firstLetterValue, true
	}
	return 0, false
}

0 commit comments

Comments
 (0)