Skip to content

Commit 618f633

Browse files
committed
Merge remote-tracking branch 'origin/feat/backup-phase0a-keymap-manifest' into feat/backup-phase0a-redis-simple
2 parents ebe4f36 + c3d1e6f commit 618f633

6 files changed

Lines changed: 226 additions & 23 deletions

File tree

internal/backup/filename.go

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,33 +81,64 @@ var ErrShaFallbackNeedsKeymap = errors.New("backup: filename uses SHA fallback;
8181
// inputs.
8282
//
8383
// The encoding is deterministic and idempotent given the same input.
84+
//
85+
// Two short-circuits ensure the encoder never trips its own invariants:
86+
//
87+
// - If raw is so large that percent-encoding it would always overflow
88+
// maxSegmentBytes (3*len(raw) > maxSegmentBytes), we go straight to
89+
// shaFallback without allocating the full expansion. Without this an
90+
// adversarial caller could force a very large transient allocation
91+
// just to discard it.
92+
// - If the percent-encoded form happens to match the SHA-fallback shape
93+
// (32 hex chars followed by "__"), we promote it to a real
94+
// SHA-fallback so DecodeSegment's structural detection cannot
95+
// misclassify a legitimate key. Both isShaFallback and shaFallback
96+
// are true on the resulting output, so KEYMAP.jsonl carries the
97+
// original bytes for exact-byte recovery.
8498
func EncodeSegment(raw []byte) string {
99+
if len(raw)*percentEncodeMaxExpansion > maxSegmentBytes {
100+
return shaFallback(raw)
101+
}
85102
encoded := percentEncode(raw)
86-
if len(encoded) <= maxSegmentBytes {
87-
return encoded
103+
if len(encoded) > maxSegmentBytes || isShaFallback(encoded) {
104+
return shaFallback(raw)
88105
}
89-
return shaFallback(raw)
106+
return encoded
90107
}
91108

92109
// EncodeBinarySegment encodes a DynamoDB B-attribute (binary) segment as
93110
// "b64.<base64url-no-padding>" so that binary keys never collide with string
94111
// keys whose hex-encoding happens to look like base64.
95112
//
96-
// b64-encoded segments take the SHA fallback if they exceed maxSegmentBytes
97-
// after the base64 expansion (~4/3 of the raw length).
113+
// Short-circuits the SHA-fallback for inputs whose base64 expansion (~4/3 of
114+
// the raw length, plus the 4-byte "b64." prefix) would always overflow
115+
// maxSegmentBytes. As with EncodeSegment, this avoids an unnecessary large
116+
// allocation when the result would have been discarded anyway.
98117
func EncodeBinarySegment(raw []byte) string {
118+
if base64.RawURLEncoding.EncodedLen(len(raw))+len(binaryPrefix) > maxSegmentBytes {
119+
return shaFallback(raw)
120+
}
99121
enc := binaryPrefix + base64.RawURLEncoding.EncodeToString(raw)
100-
if len(enc) <= maxSegmentBytes {
101-
return enc
122+
if len(enc) > maxSegmentBytes {
123+
return shaFallback(raw)
102124
}
103-
return shaFallback(raw)
125+
return enc
104126
}
105127

106128
// DecodeSegment is the inverse of EncodeSegment for percent-encoded and
107129
// binary-prefixed inputs. SHA-fallback inputs return ErrShaFallbackNeedsKeymap
108130
// so the caller knows to consult KEYMAP.jsonl rather than treat the partial
109131
// suffix as the original key.
132+
//
133+
// As a defensive measure DecodeSegment refuses inputs longer than
134+
// maxSegmentBytes. EncodeSegment never produces such inputs, so any caller
135+
// passing one is either reading a corrupted dump or has a bug; either way the
136+
// percentDecode allocation should not run.
110137
func DecodeSegment(seg string) ([]byte, error) {
138+
if len(seg) > maxSegmentBytes {
139+
return nil, errors.Wrapf(ErrInvalidEncodedSegment,
140+
"segment length %d exceeds maximum %d", len(seg), maxSegmentBytes)
141+
}
111142
if isShaFallback(seg) {
112143
return nil, errors.WithStack(ErrShaFallbackNeedsKeymap)
113144
}
@@ -121,6 +152,10 @@ func DecodeSegment(seg string) ([]byte, error) {
121152
return percentDecode(seg)
122153
}
123154

155+
// percentEncodeMaxExpansion is the worst-case ratio of encoded length to
156+
// raw length for percentEncode (every byte expands to "%HH").
157+
const percentEncodeMaxExpansion = 3
158+
124159
// IsShaFallback reports whether seg uses the SHA-prefix-and-truncated-original
125160
// form. Such segments cannot be reversed without KEYMAP.jsonl.
126161
func IsShaFallback(seg string) bool {

internal/backup/filename_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,72 @@ func TestEncodeBinarySegment_FuzzRoundTripIfNotShaFallback(t *testing.T) {
309309
})
310310
}
311311

312+
func TestEncodeSegment_KeyMatchingShaFallbackShapeIsPromotedToFallback(t *testing.T) {
313+
t.Parallel()
314+
// A user key that is itself made of 32 hex chars + "__" + suffix
315+
// would, under naive encoding, return the raw bytes unchanged
316+
// (everything is unreserved) — but DecodeSegment's structural
317+
// detection would then misclassify it as a SHA-fallback and
318+
// return ErrShaFallbackNeedsKeymap. EncodeSegment must promote
319+
// such inputs to a real SHA-fallback so the encoded->decoded
320+
// invariant holds (decode refuses; KEYMAP carries the original).
321+
raw := []byte("0123456789abcdef0123456789abcdef__suffix")
322+
enc := EncodeSegment(raw)
323+
if !IsShaFallback(enc) {
324+
t.Fatalf("expected SHA fallback for collision-shaped input, got %q", enc)
325+
}
326+
// The fallback's hex prefix must be the SHA of the raw bytes,
327+
// NOT the raw bytes' first 32 chars. That way a KEYMAP entry
328+
// keyed on `enc` carries the actual original — not a structural
329+
// echo.
330+
if _, err := DecodeSegment(enc); !errors.Is(err, ErrShaFallbackNeedsKeymap) {
331+
t.Fatalf("decode of promoted fallback: err=%v want ErrShaFallbackNeedsKeymap", err)
332+
}
333+
}
334+
335+
func TestEncodeSegment_HugeInputDoesNotMaterialiseFullExpansion(t *testing.T) {
336+
t.Parallel()
337+
// A 1 MiB input would, if percent-encoded eagerly, allocate 3
338+
// MiB before the length check fired. The early short-circuit
339+
// must skip that allocation. We can't directly observe the
340+
// allocation here without a profile, but we can assert the
341+
// output is correct (SHA fallback, length under the ceiling)
342+
// and that the call returns promptly enough to be a no-op
343+
// guard in profile-runs.
344+
raw := make([]byte, 1<<20) // 1 MiB
345+
for i := range raw {
346+
raw[i] = byte(i)
347+
}
348+
enc := EncodeSegment(raw)
349+
if !IsShaFallback(enc) {
350+
t.Fatalf("expected SHA fallback for huge input")
351+
}
352+
if len(enc) > maxSegmentBytes {
353+
t.Fatalf("encoded len %d > max %d", len(enc), maxSegmentBytes)
354+
}
355+
}
356+
357+
func TestDecodeSegment_RejectsOversizedInput(t *testing.T) {
358+
t.Parallel()
359+
too := strings.Repeat("a", maxSegmentBytes+1)
360+
_, err := DecodeSegment(too)
361+
if !errors.Is(err, ErrInvalidEncodedSegment) {
362+
t.Fatalf("err=%v want ErrInvalidEncodedSegment for oversized input", err)
363+
}
364+
}
365+
366+
func TestEncodeBinarySegment_HugeInputTakesShaFallbackWithoutEncoding(t *testing.T) {
367+
t.Parallel()
368+
raw := make([]byte, 1<<20) // 1 MiB
369+
enc := EncodeBinarySegment(raw)
370+
if !IsShaFallback(enc) {
371+
t.Fatalf("expected SHA fallback for huge binary input, got %q", enc[:min(40, len(enc))])
372+
}
373+
if len(enc) > maxSegmentBytes {
374+
t.Fatalf("encoded len %d > max %d", len(enc), maxSegmentBytes)
375+
}
376+
}
377+
312378
func TestEncodeSegment_ShaFallbackEmbedsRecognisableSuffix(t *testing.T) {
313379
t.Parallel()
314380
// The truncated suffix in the SHA-fallback rendering must be derivable

internal/backup/keymap.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,11 @@ func NewKeymapReader(r io.Reader) *KeymapReader {
155155
// (zero, false, nil) at end of stream, and (zero, false, err) on parse
156156
// failure or I/O error. Once an error is returned the reader is sticky:
157157
// subsequent calls return the same error.
158+
//
159+
// The base64-encoded `original` field is validated at parse time rather
160+
// than lazily: a malformed dump must surface on the first read of the
161+
// affected line, not propagate silently until a much later
162+
// rec.Original() call. Same error class either way.
158163
func (r *KeymapReader) Next() (KeymapRecord, bool, error) {
159164
if r.err != nil {
160165
return KeymapRecord{}, false, r.err
@@ -176,6 +181,10 @@ func (r *KeymapReader) Next() (KeymapRecord, bool, error) {
176181
r.err = errors.Wrap(ErrInvalidKeymapRecord, "missing encoded or kind")
177182
return KeymapRecord{}, false, r.err
178183
}
184+
if _, err := base64.RawURLEncoding.DecodeString(rec.OriginalB64); err != nil {
185+
r.err = errors.Wrap(ErrInvalidKeymapRecord, err.Error())
186+
return KeymapRecord{}, false, r.err
187+
}
179188
return rec, true, nil
180189
}
181190

internal/backup/keymap_test.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,13 +157,13 @@ func TestKeymapReader_RejectsRecordWithoutEncodedOrKind(t *testing.T) {
157157
}
158158
}
159159

160-
func TestKeymapReader_AcceptsBlankLinesByPolicy(t *testing.T) {
160+
func TestKeymapReader_RejectsBlankLines(t *testing.T) {
161161
t.Parallel()
162162
// bufio.Scanner skips trailing newline but emits an empty line when one
163163
// is in the middle of the stream. We require strict JSONL — every
164-
// non-empty line must be a record. An empty line in the middle should
165-
// surface as ErrInvalidKeymapRecord rather than silently skipped, so
166-
// truncated dumps are recognised.
164+
// non-empty line must be a record. An empty line in the middle must
165+
// surface as ErrInvalidKeymapRecord rather than be silently skipped,
166+
// so truncated dumps are recognised.
167167
input := `{"encoded":"x","original":"AA","kind":"sha-fallback"}` + "\n\n" +
168168
`{"encoded":"y","original":"AA","kind":"sha-fallback"}` + "\n"
169169
r := NewKeymapReader(strings.NewReader(input))
@@ -203,3 +203,17 @@ func TestKeymapRecord_OriginalRejectsBadBase64(t *testing.T) {
203203
t.Fatalf("err = %v, want ErrInvalidKeymapRecord", err)
204204
}
205205
}
206+
207+
func TestKeymapReader_RejectsMalformedBase64AtParseTime(t *testing.T) {
208+
t.Parallel()
209+
// JSON parses fine; the structural fields are present; only the
210+
// `original` base64 is malformed. The reader must catch this on
211+
// the first Next() rather than defer it to a later Original()
212+
// call — Codex P1 #179.
213+
input := `{"encoded":"x","original":"!!!","kind":"sha-fallback"}` + "\n"
214+
r := NewKeymapReader(strings.NewReader(input))
215+
_, _, err := r.Next()
216+
if !errors.Is(err, ErrInvalidKeymapRecord) {
217+
t.Fatalf("err=%v want ErrInvalidKeymapRecord on parse-time base64 validation", err)
218+
}
219+
}

internal/backup/manifest.go

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -87,14 +87,27 @@ type Live struct {
8787
PinTokenSHA256 string `json:"pin_token_sha256,omitempty"`
8888
}
8989

90-
// Adapters lists which scopes were dumped per adapter. An empty slice
91-
// means "no scopes for this adapter were dumped"; a nil slice means
92-
// "this adapter was not in the dump's scope filter."
90+
// Adapters lists which scopes were dumped per adapter. The pointer
91+
// values express two distinguishable on-disk states:
92+
//
93+
// - nil -> the adapter was excluded from this dump (e.g.
94+
// `--adapter dynamodb,s3` filtered it out). The corresponding
95+
// JSON key is absent.
96+
// - non-nil pointer to Adapter{} -> the adapter was in scope but
97+
// no scopes for it were emitted (no tables, no buckets, etc.).
98+
// The JSON key is present with an empty object.
99+
// - non-nil pointer to a populated Adapter -> the listed scopes
100+
// were emitted.
101+
//
102+
// Storing pointers (rather than zero-value Adapter structs) is what
103+
// keeps "excluded by filter" distinguishable from "included but
104+
// empty" through json.Marshal — non-pointer fields would collapse
105+
// both states into the same on-disk shape.
93106
type Adapters struct {
94-
DynamoDB Adapter `json:"dynamodb"`
95-
S3 Adapter `json:"s3"`
96-
Redis Adapter `json:"redis"`
97-
SQS Adapter `json:"sqs"`
107+
DynamoDB *Adapter `json:"dynamodb,omitempty"`
108+
S3 *Adapter `json:"s3,omitempty"`
109+
Redis *Adapter `json:"redis,omitempty"`
110+
SQS *Adapter `json:"sqs,omitempty"`
98111
}
99112

100113
// Adapter holds the scope identifiers for one adapter. Field names are
@@ -193,6 +206,17 @@ func ReadManifest(r io.Reader) (Manifest, error) {
193206
if err := dec.Decode(&m); err != nil {
194207
return Manifest{}, errors.Wrap(ErrInvalidManifest, err.Error())
195208
}
209+
// MANIFEST.json is exactly one JSON object. Trailing bytes
210+
// (a second object, junk, even whitespace-only padding) point at
211+
// concatenation bugs or partial-write corruption — both of which
212+
// must surface here rather than be silently discarded. We use
213+
// io.Discard rather than parsing because we only care that
214+
// nothing-decodable is present; structural validation lives in
215+
// validate().
216+
if dec.More() {
217+
return Manifest{}, errors.Wrap(ErrInvalidManifest,
218+
"trailing bytes after manifest JSON object")
219+
}
196220
if m.FormatVersion == 0 {
197221
return Manifest{}, errors.Wrapf(ErrUnsupportedFormatVersion,
198222
"format_version is zero")

internal/backup/manifest_test.go

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ func TestManifest_Phase0RoundTrip(t *testing.T) {
2020
m.LastCommitTS = 4517352099840000
2121
m.Source = &Source{FSMPath: "/data/fsm-snap/0000000000000064.fsm", FSMCRC32C: "deadbeef"}
2222
m.Adapters = Adapters{
23-
DynamoDB: Adapter{Tables: []string{"orders", "users"}},
24-
S3: Adapter{Buckets: []string{"photos"}},
25-
Redis: Adapter{Databases: []uint32{0}},
26-
SQS: Adapter{Queues: []string{"orders-fifo.fifo"}},
23+
DynamoDB: &Adapter{Tables: []string{"orders", "users"}},
24+
S3: &Adapter{Buckets: []string{"photos"}},
25+
Redis: &Adapter{Databases: []uint32{0}},
26+
SQS: &Adapter{Queues: []string{"orders-fifo.fifo"}},
2727
}
2828
m.Exclusions = Exclusions{} // all defaults
2929

@@ -205,6 +205,61 @@ func TestNewPhase0SnapshotManifest_DefaultsArePopulated(t *testing.T) {
205205
}
206206
}
207207

208+
func TestReadManifest_RejectsTrailingBytes(t *testing.T) {
209+
t.Parallel()
210+
// Two manifests concatenated; the second must surface as a
211+
// trailing-bytes error rather than be silently discarded — Codex
212+
// P2 #194.
213+
m := NewPhase0SnapshotManifest(time.Now())
214+
body, err := json.Marshal(m)
215+
if err != nil {
216+
t.Fatalf("marshal: %v", err)
217+
}
218+
bad := append([]byte{}, body...)
219+
bad = append(bad, body...)
220+
_, err = ReadManifest(bytes.NewReader(bad))
221+
if !errors.Is(err, ErrInvalidManifest) {
222+
t.Fatalf("err=%v want ErrInvalidManifest on trailing bytes", err)
223+
}
224+
}
225+
226+
func TestReadManifest_RejectsTrailingNonWhitespace(t *testing.T) {
227+
t.Parallel()
228+
m := NewPhase0SnapshotManifest(time.Now())
229+
body, err := json.Marshal(m)
230+
if err != nil {
231+
t.Fatalf("marshal: %v", err)
232+
}
233+
bad := append([]byte{}, body...)
234+
bad = append(bad, []byte("garbage")...)
235+
_, err = ReadManifest(bytes.NewReader(bad))
236+
if !errors.Is(err, ErrInvalidManifest) {
237+
t.Fatalf("err=%v want ErrInvalidManifest on trailing garbage", err)
238+
}
239+
}
240+
241+
func TestAdaptersStruct_NilVsEmptyDistinguishedOnDisk(t *testing.T) {
242+
t.Parallel()
243+
// Gemini #98: an excluded adapter (nil pointer) must serialize
244+
// differently from an included-but-empty adapter (non-nil pointer
245+
// to Adapter{}).
246+
excluded := Adapters{
247+
DynamoDB: &Adapter{}, // present, no scopes
248+
// S3 / Redis / SQS left nil — out of scope
249+
}
250+
body, err := json.Marshal(excluded)
251+
if err != nil {
252+
t.Fatal(err)
253+
}
254+
out := string(body)
255+
if !strings.Contains(out, `"dynamodb":{}`) {
256+
t.Fatalf("included-empty must serialise as `dynamodb:{}`, got %s", out)
257+
}
258+
if strings.Contains(out, `"s3"`) || strings.Contains(out, `"redis"`) || strings.Contains(out, `"sqs"`) {
259+
t.Fatalf("excluded adapters must be omitted, got %s", out)
260+
}
261+
}
262+
208263
func TestWriteManifest_ProducesPrettyJSON(t *testing.T) {
209264
t.Parallel()
210265
m := NewPhase0SnapshotManifest(time.Now())

0 commit comments

Comments
 (0)