|
| 1 | +package backup |
| 2 | + |
| 3 | +import ( |
| 4 | + "bufio" |
| 5 | + "encoding/base64" |
| 6 | + "encoding/json" |
| 7 | + "io" |
| 8 | + |
| 9 | + "github.com/cockroachdb/errors" |
| 10 | +) |
| 11 | + |
| 12 | +// KEYMAP.jsonl shape (one record per line): |
| 13 | +// |
| 14 | +// {"encoded":"<encoded-segment>","original":"<base64url-no-padding>","kind":"sha-fallback"} |
| 15 | +// |
| 16 | +// Records are written in encounter order (the order the encoder produced |
| 17 | +// them) and never modified after write. The file is append-only; if the same |
| 18 | +// encoded segment is written twice the reader keeps the last entry, but the |
| 19 | +// encoder is expected not to emit duplicates within a single dump. |
| 20 | +// |
| 21 | +// Records exist only for entries whose original bytes are NOT recoverable |
| 22 | +// from the encoded filename alone: |
| 23 | +// |
| 24 | +// - KindSHAFallback — segment is `<sha-prefix-32>__<truncated-original>` |
| 25 | +// (filename length exceeded EncodeSegment's 240-byte ceiling). |
| 26 | +// - KindS3LeafData — S3 object renamed to `<obj>.elastickv-leaf-data` |
| 27 | +// because both `<obj>` and `<obj>/...` existed in the same bucket. |
| 28 | +// - KindMetaCollision — user S3 object key happened to end in |
| 29 | +// `.elastickv-meta.json`; renamed under --rename-collisions. |
| 30 | +// |
| 31 | +// A consumer that does not care about reversing these to original bytes can |
| 32 | +// ignore KEYMAP.jsonl entirely. |
const (
	// KindSHAFallback tags a record whose encoded segment uses the
	// `<sha-prefix-32>__<truncated-original>` long-filename form.
	KindSHAFallback = "sha-fallback"
	// KindS3LeafData tags an S3 object that was renamed to
	// `<obj>.elastickv-leaf-data`.
	KindS3LeafData = "s3-leaf-data"
	// KindMetaCollision tags a user S3 object whose key ended in
	// `.elastickv-meta.json` and was renamed; note the wire value is
	// "meta-suffix-rename", not "meta-collision".
	KindMetaCollision = "meta-suffix-rename"
)
| 38 | + |
// keymapBufSizeWriter is the buffer size, in bytes, handed to
// bufio.NewWriterSize by the JSONL writer. At 64 KiB many small records are
// batched into one write to the underlying io.Writer while the memory held
// per writer stays modest.
const keymapBufSizeWriter = 64 * 1024
| 43 | + |
// keymapBufSizeReader is the per-line buffer limit, in bytes, given to
// bufio.Scanner by the JSONL reader. A KEYMAP record carries a ~240-byte
// encoded segment plus the base64 of the original key; the key has no fixed
// upper bound here, but in practice it is limited by the source store's
// maximum key size. 1 MiB per line is generous. A line that still exceeds
// the limit makes the scanner fail with an error instead of handing back a
// truncated record.
const keymapBufSizeReader = 1024 * 1024
| 51 | + |
// ErrInvalidKeymapRecord is the sentinel for a KEYMAP.jsonl record that
// cannot be used. It is returned, wrapped with a detail message, from two
// places:
//
//   - KeymapReader.Next, when a line is not valid JSON for a KeymapRecord or
//     when the "encoded" or "kind" field is empty;
//   - KeymapRecord.Original, when the "original" field is not valid
//     base64url-no-padding. Next does not decode that field, so malformed
//     base64 only surfaces once Original is called.
//
// Because the sentinel is always wrapped, compare with errors.Is rather
// than ==.
var ErrInvalidKeymapRecord = errors.New("backup: invalid KEYMAP.jsonl record")
| 56 | + |
// KeymapRecord maps one encoded filename component back to the original key
// bytes. The original key may contain arbitrary bytes, so it travels inside
// the JSON document as base64url without padding; use Original to decode it.
//
// Field order is significant: encoding/json emits fields in declaration
// order, which fixes the on-disk layout of every line.
type KeymapRecord struct {
	// Encoded is the filename segment exactly as it appears in the dump
	// tree. JSON key: "encoded".
	Encoded string `json:"encoded"`
	// OriginalB64 holds the original key bytes as base64url-no-padding.
	// JSON key: "original".
	OriginalB64 string `json:"original"`
	// Kind states why the record exists (one of the Kind* constants).
	// JSON key: "kind".
	Kind string `json:"kind"`
}
| 68 | + |
| 69 | +// Original returns the decoded original key bytes from r.OriginalB64. |
| 70 | +func (r KeymapRecord) Original() ([]byte, error) { |
| 71 | + out, err := base64.RawURLEncoding.DecodeString(r.OriginalB64) |
| 72 | + if err != nil { |
| 73 | + return nil, errors.Wrap(ErrInvalidKeymapRecord, err.Error()) |
| 74 | + } |
| 75 | + return out, nil |
| 76 | +} |
| 77 | + |
// KeymapWriter appends records to a KEYMAP.jsonl stream.
//
// A KeymapWriter is NOT safe for concurrent use. The struct carries no lock,
// bufio.Writer is not documented as safe for concurrent use, and count is a
// plain int, so overlapping Write calls can race. Use one writer per scope
// from a single goroutine, or guard it with a mutex owned by the caller.
//
// The zero value is not usable for writing; construct instances with
// NewKeymapWriter.
type KeymapWriter struct {
	// bw buffers output in front of the destination writer; Close flushes it.
	bw *bufio.Writer
	// enc serialises each record as one JSON document followed by a newline.
	enc *json.Encoder
	// count tracks how many records have been written; exposed so the caller
	// can decide to omit an empty KEYMAP.jsonl file (per the spec, the file
	// is omitted when no entries exist).
	count int
}
| 89 | + |
| 90 | +// NewKeymapWriter returns a writer that appends JSONL records to w. Close |
| 91 | +// must be called to flush. |
| 92 | +func NewKeymapWriter(w io.Writer) *KeymapWriter { |
| 93 | + bw := bufio.NewWriterSize(w, keymapBufSizeWriter) |
| 94 | + enc := json.NewEncoder(bw) |
| 95 | + enc.SetEscapeHTML(false) // we never embed user keys in HTML; preserve `<>&` |
| 96 | + return &KeymapWriter{bw: bw, enc: enc} |
| 97 | +} |
| 98 | + |
| 99 | +// Write appends one KeymapRecord. The record is JSON-serialised with a |
| 100 | +// trailing newline (json.Encoder behavior), giving the JSONL contract. |
| 101 | +func (w *KeymapWriter) Write(rec KeymapRecord) error { |
| 102 | + if rec.Encoded == "" { |
| 103 | + return errors.WithStack(errors.New("backup: KEYMAP record encoded must be non-empty")) |
| 104 | + } |
| 105 | + if rec.Kind == "" { |
| 106 | + return errors.WithStack(errors.New("backup: KEYMAP record kind must be non-empty")) |
| 107 | + } |
| 108 | + if err := w.enc.Encode(rec); err != nil { |
| 109 | + return errors.WithStack(err) |
| 110 | + } |
| 111 | + w.count++ |
| 112 | + return nil |
| 113 | +} |
| 114 | + |
| 115 | +// WriteOriginal is a convenience wrapper that base64-encodes raw original |
| 116 | +// bytes for the caller. |
| 117 | +func (w *KeymapWriter) WriteOriginal(encoded string, original []byte, kind string) error { |
| 118 | + return w.Write(KeymapRecord{ |
| 119 | + Encoded: encoded, |
| 120 | + OriginalB64: base64.RawURLEncoding.EncodeToString(original), |
| 121 | + Kind: kind, |
| 122 | + }) |
| 123 | +} |
| 124 | + |
| 125 | +// Count returns the number of records written so far. Useful for the |
| 126 | +// "omit empty KEYMAP file" decision after the dump completes. |
| 127 | +func (w *KeymapWriter) Count() int { return w.count } |
| 128 | + |
| 129 | +// Close flushes any buffered records to the underlying writer. |
| 130 | +func (w *KeymapWriter) Close() error { |
| 131 | + if w.bw == nil { |
| 132 | + return nil |
| 133 | + } |
| 134 | + if err := w.bw.Flush(); err != nil { |
| 135 | + return errors.WithStack(err) |
| 136 | + } |
| 137 | + return nil |
| 138 | +} |
| 139 | + |
// KeymapReader walks a KEYMAP.jsonl stream one line at a time. Because it
// scans line by line with a buffer capped at keymapBufSizeReader, its memory
// use does not grow with the size of the file.
type KeymapReader struct {
	// sc splits the input into lines.
	sc *bufio.Scanner
	// err remembers the first failure so later Next calls keep returning it.
	err error
}
| 146 | + |
| 147 | +// NewKeymapReader wraps r so the caller can iterate records via Next. |
| 148 | +func NewKeymapReader(r io.Reader) *KeymapReader { |
| 149 | + sc := bufio.NewScanner(r) |
| 150 | + sc.Buffer(make([]byte, 0, keymapBufSizeReader), keymapBufSizeReader) |
| 151 | + return &KeymapReader{sc: sc} |
| 152 | +} |
| 153 | + |
| 154 | +// Next decodes the next record. It returns (rec, true, nil) on success, |
| 155 | +// (zero, false, nil) at end of stream, and (zero, false, err) on parse |
| 156 | +// failure or I/O error. Once an error is returned the reader is sticky: |
| 157 | +// subsequent calls return the same error. |
| 158 | +func (r *KeymapReader) Next() (KeymapRecord, bool, error) { |
| 159 | + if r.err != nil { |
| 160 | + return KeymapRecord{}, false, r.err |
| 161 | + } |
| 162 | + if !r.sc.Scan() { |
| 163 | + if err := r.sc.Err(); err != nil { |
| 164 | + r.err = errors.WithStack(err) |
| 165 | + return KeymapRecord{}, false, r.err |
| 166 | + } |
| 167 | + return KeymapRecord{}, false, nil |
| 168 | + } |
| 169 | + line := r.sc.Bytes() |
| 170 | + var rec KeymapRecord |
| 171 | + if err := json.Unmarshal(line, &rec); err != nil { |
| 172 | + r.err = errors.Wrap(ErrInvalidKeymapRecord, err.Error()) |
| 173 | + return KeymapRecord{}, false, r.err |
| 174 | + } |
| 175 | + if rec.Encoded == "" || rec.Kind == "" { |
| 176 | + r.err = errors.Wrap(ErrInvalidKeymapRecord, "missing encoded or kind") |
| 177 | + return KeymapRecord{}, false, r.err |
| 178 | + } |
| 179 | + return rec, true, nil |
| 180 | +} |
| 181 | + |
| 182 | +// LoadKeymap reads every record from r into an in-memory map keyed by |
| 183 | +// encoded segment. The last record wins on duplicates. Suitable for |
| 184 | +// scopes where the keymap fits comfortably in memory; for large scopes |
| 185 | +// callers should use KeymapReader directly. |
| 186 | +func LoadKeymap(r io.Reader) (map[string]KeymapRecord, error) { |
| 187 | + out := make(map[string]KeymapRecord) |
| 188 | + rd := NewKeymapReader(r) |
| 189 | + for { |
| 190 | + rec, ok, err := rd.Next() |
| 191 | + if err != nil { |
| 192 | + return nil, err |
| 193 | + } |
| 194 | + if !ok { |
| 195 | + return out, nil |
| 196 | + } |
| 197 | + out[rec.Encoded] = rec |
| 198 | + } |
| 199 | +} |
0 commit comments