Skip to content

Commit 2cd58a9

Browse files
committed
backup: enforce field presence + version-first decode (PR #712, round 5)
Codex P2 round 5 raised two correctness issues: 1. KeymapReader.Next accepted records missing the `original` field because base64.RawURLEncoding.DecodeString("") succeeds. A corrupted line dropping `original` silently rewrote the encoded->original mapping to empty bytes, breaking exact key recovery for SHA-fallback / collision-renamed entries. Now the reader decodes into a presence-aware map first and rejects records missing any of {encoded, original, kind}. Explicit empty string remains valid. 2. ReadManifest decoded into Manifest before applying the format_version gate. A future-major manifest that also changed the JSON type of a known field (e.g. `phase` string -> int) was surfacing as ErrInvalidManifest instead of ErrUnsupportedFormatVersion, breaking the advertised version-branching contract. Now ReadManifest probes format_version with a relaxed shape first, branches on the result, and only runs the strict struct decode on a known-supported version. Tests: TestKeymapReader_RejectsMissingOriginalField, TestKeymapReader_AcceptsExplicitEmptyOriginal, TestReadManifest_FutureMajorVersionTakesPrecedenceOverTypeMismatch.
1 parent 7364133 commit 2cd58a9

4 files changed

Lines changed: 139 additions & 16 deletions

File tree

internal/backup/keymap.go

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -172,20 +172,47 @@ func (r *KeymapReader) Next() (KeymapRecord, bool, error) {
172172
return KeymapRecord{}, false, nil
173173
}
174174
line := r.sc.Bytes()
175+
rec, err := decodeKeymapLine(line)
176+
if err != nil {
177+
r.err = err
178+
return KeymapRecord{}, false, r.err
179+
}
180+
return rec, true, nil
181+
}
182+
183+
// decodeKeymapLine parses one JSONL record. It enforces three properties:
184+
//
185+
// 1. The record must contain `encoded`, `original`, and `kind` fields —
186+
// a missing `original` would otherwise be silently rewritten to empty
187+
// bytes by base64.RawURLEncoding.DecodeString(""). Codex P2 round 5.
188+
// 2. `encoded` and `kind` must be non-empty strings.
189+
// 3. `original` (the base64) must be parseable at parse time so a
190+
// corrupted dump fails on first read rather than at later
191+
// Original() call. Codex P1 #179.
192+
func decodeKeymapLine(line []byte) (KeymapRecord, error) {
193+
// Two-phase decode: first into a presence-aware map so we can
194+
// distinguish "field absent" from "field present and empty
195+
// string"; then into the typed struct for value extraction.
196+
var fields map[string]json.RawMessage
197+
if err := json.Unmarshal(line, &fields); err != nil {
198+
return KeymapRecord{}, errors.Wrap(ErrInvalidKeymapRecord, err.Error())
199+
}
200+
for _, name := range [...]string{"encoded", "original", "kind"} {
201+
if _, ok := fields[name]; !ok {
202+
return KeymapRecord{}, errors.Wrapf(ErrInvalidKeymapRecord, "missing field %q", name)
203+
}
204+
}
175205
var rec KeymapRecord
176206
if err := json.Unmarshal(line, &rec); err != nil {
177-
r.err = errors.Wrap(ErrInvalidKeymapRecord, err.Error())
178-
return KeymapRecord{}, false, r.err
207+
return KeymapRecord{}, errors.Wrap(ErrInvalidKeymapRecord, err.Error())
179208
}
180209
if rec.Encoded == "" || rec.Kind == "" {
181-
r.err = errors.Wrap(ErrInvalidKeymapRecord, "missing encoded or kind")
182-
return KeymapRecord{}, false, r.err
210+
return KeymapRecord{}, errors.Wrap(ErrInvalidKeymapRecord, "missing encoded or kind")
183211
}
184212
if _, err := base64.RawURLEncoding.DecodeString(rec.OriginalB64); err != nil {
185-
r.err = errors.Wrap(ErrInvalidKeymapRecord, err.Error())
186-
return KeymapRecord{}, false, r.err
213+
return KeymapRecord{}, errors.Wrap(ErrInvalidKeymapRecord, err.Error())
187214
}
188-
return rec, true, nil
215+
return rec, nil
189216
}
190217

191218
// LoadKeymap reads every record from r into an in-memory map keyed by

internal/backup/keymap_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,3 +217,47 @@ func TestKeymapReader_RejectsMalformedBase64AtParseTime(t *testing.T) {
217217
t.Fatalf("err=%v want ErrInvalidKeymapRecord on parse-time base64 validation", err)
218218
}
219219
}
220+
221+
// TestKeymapReader_RejectsMissingOriginalField exercises Codex P2 round 5:
222+
// a record that omits `original` entirely must not be accepted as if the
223+
// original key were empty bytes, because base64.DecodeString("") succeeds
224+
// silently. A truncated dump that drops `original` would otherwise rewrite
225+
// the encoded->original mapping to empty bytes and break exact key recovery
226+
// for SHA-fallback or collision-renamed entries.
227+
func TestKeymapReader_RejectsMissingOriginalField(t *testing.T) {
228+
t.Parallel()
229+
// All structural keys present except `original`. Without the
230+
// presence check this passes, because rec.OriginalB64 defaults to
231+
// "" and base64 decode of "" succeeds.
232+
input := `{"encoded":"x","kind":"sha-fallback"}` + "\n"
233+
r := NewKeymapReader(strings.NewReader(input))
234+
_, _, err := r.Next()
235+
if !errors.Is(err, ErrInvalidKeymapRecord) {
236+
t.Fatalf("err=%v want ErrInvalidKeymapRecord on missing `original` field", err)
237+
}
238+
// Sticky: a subsequent Next must keep returning the same error class.
239+
_, _, err2 := r.Next()
240+
if !errors.Is(err2, ErrInvalidKeymapRecord) {
241+
t.Fatalf("non-sticky error: %v", err2)
242+
}
243+
}
244+
245+
// TestKeymapReader_AcceptsExplicitEmptyOriginal sanity-checks that an
246+
// explicitly-empty `original` (the field is present, value is "") still
247+
// parses. The contract is that absence is rejected, not emptiness.
248+
func TestKeymapReader_AcceptsExplicitEmptyOriginal(t *testing.T) {
249+
t.Parallel()
250+
input := `{"encoded":"x","original":"","kind":"sha-fallback"}` + "\n"
251+
r := NewKeymapReader(strings.NewReader(input))
252+
rec, ok, err := r.Next()
253+
if err != nil || !ok {
254+
t.Fatalf("err=%v ok=%v want a record", err, ok)
255+
}
256+
got, err := rec.Original()
257+
if err != nil {
258+
t.Fatalf("Original(): %v", err)
259+
}
260+
if len(got) != 0 {
261+
t.Fatalf("Original = %q, want empty", got)
262+
}
263+
}

internal/backup/manifest.go

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package backup
22

33
import (
4+
"bytes"
45
"encoding/json"
56
"io"
67
"time"
@@ -210,8 +211,38 @@ func WriteManifest(w io.Writer, m Manifest) error {
210211
// error is wrapped as ErrUnsupportedFormatVersion or ErrInvalidManifest so
211212
// callers can branch on errors.Is.
212213
func ReadManifest(r io.Reader) (Manifest, error) {
214+
// Read the entire payload once so we can pre-decode just the
215+
// format_version before strict struct decoding. Without this
216+
// two-phase approach, a manifest produced by a newer major version
217+
// that also changed the JSON type of a known field (e.g. `phase`
218+
// switched from string to int) would surface as
219+
// ErrInvalidManifest instead of ErrUnsupportedFormatVersion,
220+
// breaking the documented version-branching contract for callers
221+
// that key off errors.Is(err, ErrUnsupportedFormatVersion). See
222+
// Codex P2, round 5.
223+
payload, err := io.ReadAll(r)
224+
if err != nil {
225+
return Manifest{}, errors.Wrap(ErrInvalidManifest, err.Error())
226+
}
227+
// Phase 1: probe format_version with a relaxed shape that tolerates
228+
// arbitrary types on every other field.
229+
var probe struct {
230+
FormatVersion uint32 `json:"format_version"`
231+
}
232+
if err := json.Unmarshal(payload, &probe); err != nil {
233+
return Manifest{}, errors.Wrap(ErrInvalidManifest, err.Error())
234+
}
235+
if probe.FormatVersion == 0 {
236+
return Manifest{}, errors.Wrapf(ErrUnsupportedFormatVersion,
237+
"format_version is zero")
238+
}
239+
if probe.FormatVersion > CurrentFormatVersion {
240+
return Manifest{}, errors.Wrapf(ErrUnsupportedFormatVersion,
241+
"format_version %d > current %d (newer producer)", probe.FormatVersion, CurrentFormatVersion)
242+
}
243+
// Phase 2: strict struct decode on a known-supported version.
213244
var m Manifest
214-
dec := json.NewDecoder(r)
245+
dec := json.NewDecoder(bytes.NewReader(payload))
215246
// We intentionally do NOT call DisallowUnknownFields here.
216247
// The format-version contract (Codex P1, follow-up) is:
217248
// - format_version > CurrentFormatVersion -> hard refuse
@@ -237,14 +268,6 @@ func ReadManifest(r io.Reader) (Manifest, error) {
237268
return Manifest{}, errors.Wrap(ErrInvalidManifest,
238269
"trailing bytes after manifest JSON object")
239270
}
240-
if m.FormatVersion == 0 {
241-
return Manifest{}, errors.Wrapf(ErrUnsupportedFormatVersion,
242-
"format_version is zero")
243-
}
244-
if m.FormatVersion > CurrentFormatVersion {
245-
return Manifest{}, errors.Wrapf(ErrUnsupportedFormatVersion,
246-
"format_version %d > current %d (newer producer)", m.FormatVersion, CurrentFormatVersion)
247-
}
248271
if err := m.validate(); err != nil {
249272
return Manifest{}, err
250273
}

internal/backup/manifest_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,35 @@ func TestReadManifest_RejectsFutureFormatVersion(t *testing.T) {
130130
}
131131
}
132132

133+
// TestReadManifest_FutureMajorVersionTakesPrecedenceOverTypeMismatch is the
134+
// regression test for Codex P2 round 5: a newer-major manifest that also
135+
// changes the JSON type of a known field (e.g. `phase` from string to int)
136+
// must surface as ErrUnsupportedFormatVersion, not ErrInvalidManifest. The
137+
// version-branching contract advertised to callers (errors.Is(err,
138+
// ErrUnsupportedFormatVersion) means "upgrade required") only holds if the
139+
// format_version probe runs before the strict struct decode.
140+
func TestReadManifest_FutureMajorVersionTakesPrecedenceOverTypeMismatch(t *testing.T) {
141+
t.Parallel()
142+
body := `{
143+
"format_version": 999,
144+
"phase": 42,
145+
"wall_time_iso": "2026-04-29T00:00:00Z",
146+
"adapters": {"dynamodb":{}, "s3":{}, "redis":{}, "sqs":{}},
147+
"exclusions": {"include_incomplete_uploads":false,"include_orphans":false,"preserve_sqs_visibility":false,"include_sqs_side_records":false},
148+
"checksum_algorithm": "sha256",
149+
"checksum_format": "sha256sum",
150+
"encoded_filename_charset": "rfc3986-unreserved-plus-percent",
151+
"key_segment_max_bytes": 240,
152+
"s3_meta_suffix": ".elastickv-meta.json",
153+
"s3_collision_strategy": "leaf-data-suffix",
154+
"dynamodb_layout": "per-item"
155+
}`
156+
_, err := ReadManifest(strings.NewReader(body))
157+
if !errors.Is(err, ErrUnsupportedFormatVersion) {
158+
t.Fatalf("err=%v want ErrUnsupportedFormatVersion (must precede strict decode)", err)
159+
}
160+
}
161+
133162
func TestReadManifest_RejectsZeroFormatVersion(t *testing.T) {
134163
t.Parallel()
135164
m := NewPhase0SnapshotManifest(time.Now())

0 commit comments

Comments
 (0)