Skip to content

Commit 3022b92

Browse files
committed
Merge branch 'feat/backup-phase0a-filename' of github.com:bootjp/elastickv into feat/backup-phase0a-keymap-manifest
2 parents a0d9244 + e7a84eb commit 3022b92

2 files changed

Lines changed: 109 additions & 8 deletions

File tree

internal/backup/filename.go

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,33 +81,64 @@ var ErrShaFallbackNeedsKeymap = errors.New("backup: filename uses SHA fallback;
8181
// inputs.
8282
//
8383
// The encoding is deterministic and idempotent given the same input.
84+
//
85+
// Two short-circuits ensure the encoder never trips its own invariants:
86+
//
87+
// - If raw is so large that percent-encoding it would always overflow
88+
// maxSegmentBytes (3*len(raw) > maxSegmentBytes), we go straight to
89+
// shaFallback without allocating the full expansion. Without this an
90+
// adversarial caller could force a very large transient allocation
91+
// just to discard it.
92+
// - If the percent-encoded form happens to match the SHA-fallback shape
93+
// (32 hex chars followed by "__"), we promote it to a real
94+
// SHA-fallback so DecodeSegment's structural detection cannot
95+
// misclassify a legitimate key. Both isShaFallback and shaFallback
96+
// are true on the resulting output, so KEYMAP.jsonl carries the
97+
// original bytes for exact-byte recovery.
8498
func EncodeSegment(raw []byte) string {
99+
if len(raw)*percentEncodeMaxExpansion > maxSegmentBytes {
100+
return shaFallback(raw)
101+
}
85102
encoded := percentEncode(raw)
86-
if len(encoded) <= maxSegmentBytes {
87-
return encoded
103+
if len(encoded) > maxSegmentBytes || isShaFallback(encoded) {
104+
return shaFallback(raw)
88105
}
89-
return shaFallback(raw)
106+
return encoded
90107
}
91108

92109
// EncodeBinarySegment encodes a DynamoDB B-attribute (binary) segment as
93110
// "b64.<base64url-no-padding>" so that binary keys never collide with string
94111
// keys whose hex-encoding happens to look like base64.
95112
//
96-
// b64-encoded segments take the SHA fallback if they exceed maxSegmentBytes
97-
// after the base64 expansion (~4/3 of the raw length).
113+
// Short-circuits to the SHA fallback for inputs whose base64 expansion (~4/3 of
114+
// the raw length, plus the 4-byte "b64." prefix) would always overflow
115+
// maxSegmentBytes. As with EncodeSegment, this avoids an unnecessary large
116+
// allocation when the result would have been discarded anyway.
98117
func EncodeBinarySegment(raw []byte) string {
118+
if base64.RawURLEncoding.EncodedLen(len(raw))+len(binaryPrefix) > maxSegmentBytes {
119+
return shaFallback(raw)
120+
}
99121
enc := binaryPrefix + base64.RawURLEncoding.EncodeToString(raw)
100-
if len(enc) <= maxSegmentBytes {
101-
return enc
122+
if len(enc) > maxSegmentBytes {
123+
return shaFallback(raw)
102124
}
103-
return shaFallback(raw)
125+
return enc
104126
}
105127

106128
// DecodeSegment is the inverse of EncodeSegment for percent-encoded and
107129
// binary-prefixed inputs. SHA-fallback inputs return ErrShaFallbackNeedsKeymap
108130
// so the caller knows to consult KEYMAP.jsonl rather than treat the partial
109131
// suffix as the original key.
132+
//
133+
// As a defensive measure DecodeSegment refuses inputs longer than
134+
// maxSegmentBytes. EncodeSegment never produces such segments, so any caller
135+
// passing one is either reading a corrupted dump or has a bug; either way the
136+
// percentDecode allocation should not run.
110137
func DecodeSegment(seg string) ([]byte, error) {
138+
if len(seg) > maxSegmentBytes {
139+
return nil, errors.Wrapf(ErrInvalidEncodedSegment,
140+
"segment length %d exceeds maximum %d", len(seg), maxSegmentBytes)
141+
}
111142
if isShaFallback(seg) {
112143
return nil, errors.WithStack(ErrShaFallbackNeedsKeymap)
113144
}
@@ -121,6 +152,10 @@ func DecodeSegment(seg string) ([]byte, error) {
121152
return percentDecode(seg)
122153
}
123154

155+
// percentEncodeMaxExpansion is the worst-case ratio of encoded length to
156+
// raw length for percentEncode (every byte expands to "%HH").
157+
const percentEncodeMaxExpansion = 3
158+
124159
// IsShaFallback reports whether seg uses the SHA-prefix-and-truncated-original
125160
// form. Such segments cannot be reversed without KEYMAP.jsonl.
126161
func IsShaFallback(seg string) bool {

internal/backup/filename_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,72 @@ func TestEncodeBinarySegment_FuzzRoundTripIfNotShaFallback(t *testing.T) {
309309
})
310310
}
311311

312+
func TestEncodeSegment_KeyMatchingShaFallbackShapeIsPromotedToFallback(t *testing.T) {
313+
t.Parallel()
314+
// A user key that is itself made of 32 hex chars + "__" + suffix
315+
// would, under naive encoding, return the raw bytes unchanged
316+
// (everything is unreserved) — but DecodeSegment's structural
317+
// detection would then misclassify it as a SHA-fallback and
318+
// return ErrShaFallbackNeedsKeymap. EncodeSegment must promote
319+
// such inputs to a real SHA-fallback so the encoded->decoded
320+
// invariant holds (decode refuses; KEYMAP carries the original).
321+
raw := []byte("0123456789abcdef0123456789abcdef__suffix")
322+
enc := EncodeSegment(raw)
323+
if !IsShaFallback(enc) {
324+
t.Fatalf("expected SHA fallback for collision-shaped input, got %q", enc)
325+
}
326+
// The fallback's hex prefix must be the SHA of the raw bytes,
327+
// NOT the raw bytes' first 32 chars. That way a KEYMAP entry
328+
// keyed on `enc` carries the actual original — not a structural
329+
// echo.
330+
if _, err := DecodeSegment(enc); !errors.Is(err, ErrShaFallbackNeedsKeymap) {
331+
t.Fatalf("decode of promoted fallback: err=%v want ErrShaFallbackNeedsKeymap", err)
332+
}
333+
}
334+
335+
func TestEncodeSegment_HugeInputDoesNotMaterialiseFullExpansion(t *testing.T) {
336+
t.Parallel()
337+
// A 1 MiB input would, if percent-encoded eagerly, allocate 3
338+
// MiB before the length check fired. The early short-circuit
339+
// must skip that allocation. We can't directly observe the
340+
// allocation here without a profile, but we can assert the
341+
// output is correct (SHA fallback, length under the ceiling)
342+
// and that the call returns promptly enough to be a no-op
343+
// guard in profile-runs.
344+
raw := make([]byte, 1<<20) // 1 MiB
345+
for i := range raw {
346+
raw[i] = byte(i)
347+
}
348+
enc := EncodeSegment(raw)
349+
if !IsShaFallback(enc) {
350+
t.Fatalf("expected SHA fallback for huge input")
351+
}
352+
if len(enc) > maxSegmentBytes {
353+
t.Fatalf("encoded len %d > max %d", len(enc), maxSegmentBytes)
354+
}
355+
}
356+
357+
func TestDecodeSegment_RejectsOversizedInput(t *testing.T) {
358+
t.Parallel()
359+
too := strings.Repeat("a", maxSegmentBytes+1)
360+
_, err := DecodeSegment(too)
361+
if !errors.Is(err, ErrInvalidEncodedSegment) {
362+
t.Fatalf("err=%v want ErrInvalidEncodedSegment for oversized input", err)
363+
}
364+
}
365+
366+
func TestEncodeBinarySegment_HugeInputTakesShaFallbackWithoutEncoding(t *testing.T) {
367+
t.Parallel()
368+
raw := make([]byte, 1<<20) // 1 MiB
369+
enc := EncodeBinarySegment(raw)
370+
if !IsShaFallback(enc) {
371+
t.Fatalf("expected SHA fallback for huge binary input, got %q", enc[:min(40, len(enc))])
372+
}
373+
if len(enc) > maxSegmentBytes {
374+
t.Fatalf("encoded len %d > max %d", len(enc), maxSegmentBytes)
375+
}
376+
}
377+
312378
func TestEncodeSegment_ShaFallbackEmbedsRecognisableSuffix(t *testing.T) {
313379
t.Parallel()
314380
// The truncated suffix in the SHA-fallback rendering must be derivable

0 commit comments

Comments
 (0)