Skip to content

Commit a0d9244

Browse files
committed
backup: KEYMAP.jsonl writer/reader and MANIFEST.json schema (Phase 0a)
Builds on PR #711 (filename encoding). Adds two more foundation pieces of the Phase 0 logical-backup decoder.

KEYMAP.jsonl (internal/backup/keymap.go)
- Append-only JSONL stream of {encoded, original (b64url), kind} records.
- Records exist only for entries whose original bytes are NOT recoverable from the encoded filename alone:
  - KindSHAFallback (segments rendered as <sha32>__<truncated>)
  - KindS3LeafData (S3 path collisions renamed to .elastickv-leaf-data)
  - KindMetaCollision (user object key ending in .elastickv-meta.json)
- KeymapWriter: streaming append, json encoder configured to skip HTML escapes so user-key bytes round-trip cleanly. Refuses empty encoded or kind so producer bugs surface loudly. Count() exposed for the "omit empty file" decision.
- KeymapReader: line-by-line scanner with bounded buffer (1 MiB); blank lines surface as ErrInvalidKeymapRecord rather than being silently skipped so truncated dumps are recognised.
- LoadKeymap: convenience helper that materialises the file as a map (last-wins on duplicates).

MANIFEST.json (internal/backup/manifest.go)
- Manifest, Source, Live, Adapters, Adapter, Exclusions structs matching the schema in docs/design/2026_04_29_proposed_snapshot_logical_decoder.md.
- CurrentFormatVersion = 1; ReadManifest refuses format_version > current and format_version == 0 (ErrUnsupportedFormatVersion).
- Phase discriminator constants for Phase 0 ("phase0-snapshot-decode") and Phase 1 ("phase1-live-pinned"); Phase 0 manifests must not set Live, Phase 1 must not set Source -- both validated at write and read time.
- DisallowUnknownFields on read so format drift surfaces loudly.
- Pretty-printed output (2-space indent, no HTML escapes) since MANIFEST.json is operator-facing.
- NewPhase0SnapshotManifest seeds the policy fields with the documented defaults so callers can focus on populating per-dump metadata.
Tests cover round-trip, sticky-error semantics, last-wins dedup, HTML-escape suppression, future-version refusal, unknown-field refusal, unknown-phase refusal, and the cross-phase Source/Live exclusion rules.
1 parent 25f5da3 commit a0d9244

4 files changed

Lines changed: 899 additions & 0 deletions

File tree

internal/backup/keymap.go

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
package backup
2+
3+
import (
4+
"bufio"
5+
"encoding/base64"
6+
"encoding/json"
7+
"io"
8+
9+
"github.com/cockroachdb/errors"
10+
)
11+
12+
// KEYMAP.jsonl shape (one record per line):
13+
//
14+
// {"encoded":"<encoded-segment>","original":"<base64url-no-padding>","kind":"sha-fallback"}
15+
//
16+
// Records are written in encounter order (the order the encoder produced
17+
// them) and never modified after write. The file is append-only; if the same
18+
// encoded segment is written twice the reader keeps the last entry, but the
19+
// encoder is expected not to emit duplicates within a single dump.
20+
//
21+
// Records exist only for entries whose original bytes are NOT recoverable
22+
// from the encoded filename alone:
23+
//
24+
// - KindSHAFallback — segment is `<sha-prefix-32>__<truncated-original>`
25+
// (filename length exceeded EncodeSegment's 240-byte ceiling).
26+
// - KindS3LeafData — S3 object renamed to `<obj>.elastickv-leaf-data`
27+
// because both `<obj>` and `<obj>/...` existed in the same bucket.
28+
// - KindMetaCollision — user S3 object key happened to end in
29+
// `.elastickv-meta.json`; renamed under --rename-collisions.
30+
//
31+
// A consumer that does not care about reversing these to original bytes can
32+
// ignore KEYMAP.jsonl entirely.
33+
const (
34+
// KindSHAFallback is the kind value for records whose encoded segment has
// the `<sha-prefix-32>__<truncated-original>` form.
KindSHAFallback = "sha-fallback"
35+
// KindS3LeafData is the kind value for S3 objects renamed to
// `<obj>.elastickv-leaf-data`.
KindS3LeafData = "s3-leaf-data"
36+
// KindMetaCollision is the kind value for user S3 object keys ending in
// `.elastickv-meta.json` that were renamed.
KindMetaCollision = "meta-suffix-rename"
37+
)
38+
39+
// keymapBufSizeWriter is the bufio.Writer buffer size, in bytes, that
40+
// NewKeymapWriter uses for the JSONL stream. 64 KiB amortises the
// per-syscall cost across hundreds of small records
41+
// without holding pathological amounts of memory.
42+
const keymapBufSizeWriter = 64 << 10
43+
44+
// keymapBufSizeReader is the per-line limit, in bytes, that NewKeymapReader
45+
// hands to bufio.Scanner.Buffer. KEYMAP records carry a ~240-byte encoded
46+
// segment plus a base64'd original key (which can itself be arbitrarily
47+
// large but is bounded by the practical maximum key size on the source
48+
// store). 1 MiB per line is generous; if a line genuinely exceeds it the
49+
// scanner stops with an error (bufio.ErrTooLong) that Next returns to the
// caller, rather than a truncated record being yielded.
50+
const keymapBufSizeReader = 1 << 20
51+
52+
// ErrInvalidKeymapRecord is the sentinel wrapped by KeymapReader.Next when a
53+
// line is not valid JSON for a KeymapRecord or has an empty encoded or kind
54+
// field, and by KeymapRecord.Original when the stored original is not valid
// base64url-no-padding. Next does not decode the base64 payload itself, so
// a malformed original only surfaces when Original is called.
55+
var ErrInvalidKeymapRecord = errors.New("backup: invalid KEYMAP.jsonl record")
56+
57+
// KeymapRecord is one line of KEYMAP.jsonl: a mapping from an encoded
58+
// filename component back to the original key bytes. Original bytes are
59+
// arbitrary (binary safe), so they are carried as base64url-no-padding text
// inside the JSON document.
60+
type KeymapRecord struct {
61+
// Encoded is the filename segment as it appears in the dump tree.
// Serialised under the JSON key "encoded".
62+
Encoded string `json:"encoded"`
63+
// OriginalB64 is base64url-no-padding of the original key bytes; use
// Original to decode it. Serialised under the JSON key "original".
64+
OriginalB64 string `json:"original"`
65+
// Kind classifies why this record exists; see Kind* constants.
// Serialised under the JSON key "kind".
66+
Kind string `json:"kind"`
67+
}
68+
69+
// Original returns the decoded original key bytes from r.OriginalB64.
70+
func (r KeymapRecord) Original() ([]byte, error) {
71+
out, err := base64.RawURLEncoding.DecodeString(r.OriginalB64)
72+
if err != nil {
73+
return nil, errors.Wrap(ErrInvalidKeymapRecord, err.Error())
74+
}
75+
return out, nil
76+
}
77+
78+
// KeymapWriter appends records to a KEYMAP.jsonl stream. The type holds no
79+
// lock, so concurrent calls to Write are NOT serialised and must not
80+
// overlap; the caller is expected to use a single writer per scope.
81+
type KeymapWriter struct {
82+
// bw buffers encoded output; Close flushes it to the destination writer.
bw *bufio.Writer
83+
// enc encodes each record as one JSON line into bw.
enc *json.Encoder
84+
// count tracks how many records have been written; exposed so the caller
85+
// can decide to omit an empty KEYMAP.jsonl file (per the spec, the file
86+
// is omitted when no entries exist).
87+
count int
88+
}
89+
90+
// NewKeymapWriter returns a writer that appends JSONL records to w. Close
91+
// must be called to flush.
92+
func NewKeymapWriter(w io.Writer) *KeymapWriter {
93+
bw := bufio.NewWriterSize(w, keymapBufSizeWriter)
94+
enc := json.NewEncoder(bw)
95+
enc.SetEscapeHTML(false) // we never embed user keys in HTML; preserve `<>&`
96+
return &KeymapWriter{bw: bw, enc: enc}
97+
}
98+
99+
// Write appends one KeymapRecord. The record is JSON-serialised with a
100+
// trailing newline (json.Encoder behavior), giving the JSONL contract.
101+
func (w *KeymapWriter) Write(rec KeymapRecord) error {
102+
if rec.Encoded == "" {
103+
return errors.WithStack(errors.New("backup: KEYMAP record encoded must be non-empty"))
104+
}
105+
if rec.Kind == "" {
106+
return errors.WithStack(errors.New("backup: KEYMAP record kind must be non-empty"))
107+
}
108+
if err := w.enc.Encode(rec); err != nil {
109+
return errors.WithStack(err)
110+
}
111+
w.count++
112+
return nil
113+
}
114+
115+
// WriteOriginal is a convenience wrapper that base64-encodes raw original
116+
// bytes for the caller.
117+
func (w *KeymapWriter) WriteOriginal(encoded string, original []byte, kind string) error {
118+
return w.Write(KeymapRecord{
119+
Encoded: encoded,
120+
OriginalB64: base64.RawURLEncoding.EncodeToString(original),
121+
Kind: kind,
122+
})
123+
}
124+
125+
// Count returns the number of records written so far. Useful for the
126+
// "omit empty KEYMAP file" decision after the dump completes.
127+
func (w *KeymapWriter) Count() int { return w.count }
128+
129+
// Close flushes any buffered records to the underlying writer.
130+
func (w *KeymapWriter) Close() error {
131+
if w.bw == nil {
132+
return nil
133+
}
134+
if err := w.bw.Flush(); err != nil {
135+
return errors.WithStack(err)
136+
}
137+
return nil
138+
}
139+
140+
// KeymapReader iterates JSONL records line-by-line. Memory footprint is
141+
// bounded by keymapBufSizeReader regardless of file size.
142+
type KeymapReader struct {
143+
// sc splits the input into lines; NewKeymapReader configures its buffer.
sc *bufio.Scanner
144+
// err is the first error Next returned; once set, Next keeps returning it.
err error
145+
}
146+
147+
// NewKeymapReader wraps r so the caller can iterate records via Next.
148+
func NewKeymapReader(r io.Reader) *KeymapReader {
149+
sc := bufio.NewScanner(r)
150+
sc.Buffer(make([]byte, 0, keymapBufSizeReader), keymapBufSizeReader)
151+
return &KeymapReader{sc: sc}
152+
}
153+
154+
// Next decodes the next record. It returns (rec, true, nil) on success,
155+
// (zero, false, nil) at end of stream, and (zero, false, err) on parse
156+
// failure or I/O error. Once an error is returned the reader is sticky:
157+
// subsequent calls return the same error.
158+
func (r *KeymapReader) Next() (KeymapRecord, bool, error) {
159+
if r.err != nil {
160+
return KeymapRecord{}, false, r.err
161+
}
162+
if !r.sc.Scan() {
163+
if err := r.sc.Err(); err != nil {
164+
r.err = errors.WithStack(err)
165+
return KeymapRecord{}, false, r.err
166+
}
167+
return KeymapRecord{}, false, nil
168+
}
169+
line := r.sc.Bytes()
170+
var rec KeymapRecord
171+
if err := json.Unmarshal(line, &rec); err != nil {
172+
r.err = errors.Wrap(ErrInvalidKeymapRecord, err.Error())
173+
return KeymapRecord{}, false, r.err
174+
}
175+
if rec.Encoded == "" || rec.Kind == "" {
176+
r.err = errors.Wrap(ErrInvalidKeymapRecord, "missing encoded or kind")
177+
return KeymapRecord{}, false, r.err
178+
}
179+
return rec, true, nil
180+
}
181+
182+
// LoadKeymap reads every record from r into an in-memory map keyed by
183+
// encoded segment. The last record wins on duplicates. Suitable for
184+
// scopes where the keymap fits comfortably in memory; for large scopes
185+
// callers should use KeymapReader directly.
186+
func LoadKeymap(r io.Reader) (map[string]KeymapRecord, error) {
187+
out := make(map[string]KeymapRecord)
188+
rd := NewKeymapReader(r)
189+
for {
190+
rec, ok, err := rd.Next()
191+
if err != nil {
192+
return nil, err
193+
}
194+
if !ok {
195+
return out, nil
196+
}
197+
out[rec.Encoded] = rec
198+
}
199+
}

0 commit comments

Comments
 (0)