Skip to content

Commit cacd448

Browse files
committed
backup: hash field array shape + 32-bit safety + kindByKey centralisation (PR #725, round 1)
Codex round 12 raised two P1 findings on commit 72e2b75; Gemini raised one high and three mediums alongside. P1: Hash field name encoding via JSON object key was lossy. - Distinct fields could collapse to the same JSON key: a UTF-8 literal `%FF` and a single binary byte `0xFF` both ended up as `%FF` after the EncodeSegment-on-non-UTF-8 fallback, silently dropping data on hashes with both forms. - Long binary field names (>240 bytes) hit EncodeSegment's SHA fallback which is irreversible at this layer (no per-field KEYMAP), so restore could not reconstruct the original bytes. Fix: change `fields` to a JSON array of typed `{name, value}` records. Both name and value go through the same UTF-8-or-base64 envelope (marshalRedisBinaryValue, mirrors PR #714's SQS body), so arbitrary binary bytes round-trip without collisions and without SHA fallback. High (Gemini): parseUserKeyLenPrefix used `int(uint32)` which wraps to negative on 32-bit when the high bit is set, bypassing the bounds check and causing a slice panic. Compare in `uint64` space before converting. Medium (Gemini, ×3): centralise `kindByKey` registration in the hashState helper so meta/field/TTL paths agree on the kind in one place. Audited callers — HandleHashMeta, HandleHashField, and HandleTTL's `case redisKindHash:` arm — all reach hashState via the same userKey identity, and the TTL caller already knows the kind so the now-implicit kindByKey write is idempotent. No other callers per `grep -n "r\.hashState(" internal/backup/`. Tests: - TestRedisDB_HashRoundTripBasic updated to assert the array shape via new hashFieldArray / hashFieldByName helpers. - TestRedisDB_HashEmptyHashStillEmitsFile, _BinaryFieldValue updated likewise. - NEW TestRedisDB_HashBinaryFieldNameRoundTripsViaArray drives the exact Codex collision pair (UTF-8 `%FF` + binary 0xFF) and asserts both names survive distinctly with their expected types. Verified: linux race tests, GOOS=windows, GOOS=js wasm, lint — all clean.
1 parent 72e2b75 commit cacd448

2 files changed

Lines changed: 170 additions & 41 deletions

File tree

internal/backup/redis_hash.go

Lines changed: 54 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ func (r *RedisDB) HandleHashMeta(key, value []byte) error {
6363
st := r.hashState(userKey)
6464
st.declaredLen = int64(binary.BigEndian.Uint64(value)) //nolint:gosec // signed int64 by design
6565
st.metaSeen = true
66-
r.kindByKey[string(userKey)] = redisKindHash
6766
return nil
6867
}
6968

@@ -80,17 +79,31 @@ func (r *RedisDB) HandleHashField(key, value []byte) error {
8079
}
8180
st := r.hashState(userKey)
8281
st.fields[string(fieldName)] = bytes.Clone(value)
83-
r.kindByKey[string(userKey)] = redisKindHash
8482
return nil
8583
}
8684

87-
// hashState lazily creates per-key state.
85+
// hashState lazily creates per-key state. The `kindByKey` registration
86+
// lives here (Gemini medium PR #725 #1/#3) so every code path that
87+
// touches a hash state — meta, field, and the TTL-routing back-edge
88+
// from HandleTTL — agrees on the kind. Caller audit (per the loop's
89+
// "audit semantics-changing edits" rule):
90+
//
91+
// - HandleHashMeta and HandleHashField both want kindByKey set;
92+
// centralising here means the explicit assignment is no longer
93+
// needed at the call site.
94+
// - HandleTTL only ever calls hashState() inside the
95+
// `case redisKindHash:` branch, where kindByKey == redisKindHash
96+
// already holds; the assignment here is idempotent for that path.
97+
// - No other caller exists; verified via
98+
// `grep -n "r\.hashState(" internal/backup/`.
8899
func (r *RedisDB) hashState(userKey []byte) *redisHashState {
89-
if st, ok := r.hashes[string(userKey)]; ok {
100+
uk := string(userKey)
101+
if st, ok := r.hashes[uk]; ok {
90102
return st
91103
}
92104
st := &redisHashState{fields: make(map[string][]byte)}
93-
r.hashes[string(userKey)] = st
105+
r.hashes[uk] = st
106+
r.kindByKey[uk] = redisKindHash
94107
return st
95108
}
96109

@@ -122,15 +135,20 @@ func parseHashFieldKey(key []byte) ([]byte, []byte, bool) {
122135
// parseUserKeyLenPrefix decodes the shared <len(4)><userKey> shape used
123136
// by every wide-column !hs|/!st|/!zs| key. Returns the userKey slice
124137
// (aliasing the input) plus a presence flag.
138+
//
139+
// The length comparison is done in uint64 space because on 32-bit
140+
// architectures `int(uint32)` can wrap to a negative value when the
141+
// high bit is set, bypassing the bounds check and causing a slice
142+
// panic. Gemini high finding (PR #725 round 1).
125143
func parseUserKeyLenPrefix(b []byte) ([]byte, bool) {
126144
if len(b) < hashUserKeyLenSize {
127145
return nil, false
128146
}
129-
ukLen := int(binary.BigEndian.Uint32(b[:hashUserKeyLenSize])) //nolint:gosec // bounded by len(b)
130-
if len(b) < hashUserKeyLenSize+ukLen {
147+
ukLen := binary.BigEndian.Uint32(b[:hashUserKeyLenSize])
148+
if uint64(len(b)) < uint64(hashUserKeyLenSize)+uint64(ukLen) {
131149
return nil, false
132150
}
133-
return b[hashUserKeyLenSize : hashUserKeyLenSize+ukLen], true
151+
return b[hashUserKeyLenSize : hashUserKeyLenSize+int(ukLen)], true //nolint:gosec // bounded above
134152
}
135153

136154
// flushHashes writes one JSON file per accumulated hash to
@@ -186,37 +204,49 @@ func (r *RedisDB) writeHashJSON(dir string, userKey []byte, st *redisHashState)
186204
return nil
187205
}
188206

189-
// hashJSONFields marshals the per-field map. Field NAMES are emitted
190-
// as JSON object keys, which forces them into UTF-8 string territory:
191-
// when a name is not valid UTF-8 we percent-encode it via
192-
// EncodeSegment so the output is reversible without changing JSON
193-
// shape. Field VALUES preserve full binary fidelity via the same
194-
// `{"base64":...}` envelope used for SQS bodies (see sqsMessageBody).
195-
type hashJSONFields map[string]json.RawMessage
207+
// hashFieldRecord is the dump-format projection of one Redis hash
208+
// field. Both name and value go through the same UTF-8-or-base64
209+
// envelope (json.RawMessage produced by marshalRedisBinaryValue) so
210+
// arbitrary binary bytes round-trip without lossy rewrites.
211+
//
212+
// We deliberately emit `fields` as an ARRAY of records rather than a
213+
// JSON object keyed on the field name, because Redis hash field
214+
// names are binary-safe and JSON object keys are not. With a map
215+
// shape, two distinct fields could collapse to the same JSON key
216+
// (Codex P1 round 12 #725: a UTF-8 literal `%FF` and a single byte
217+
// `0xFF` both percent-encoded to `%FF` would overwrite one
218+
// another), and a >240-byte non-UTF-8 field name would route
219+
// through EncodeSegment's SHA fallback which is non-reversible at
220+
// this layer (no per-field KEYMAP). The array form keeps every
221+
// (name, value) pair distinct and binary-safe.
222+
type hashFieldRecord struct {
223+
Name json.RawMessage `json:"name"`
224+
Value json.RawMessage `json:"value"`
225+
}
196226

197227
func marshalHashJSON(st *redisHashState) ([]byte, error) {
198-
fields := make(hashJSONFields, len(st.fields))
199-
// Sort for deterministic output across runs.
228+
// Sort by raw byte order for deterministic output across runs.
200229
names := make([]string, 0, len(st.fields))
201230
for name := range st.fields {
202231
names = append(names, name)
203232
}
204233
sort.Strings(names)
234+
fields := make([]hashFieldRecord, 0, len(names))
205235
for _, name := range names {
206-
jsonName := name
207-
if !utf8.ValidString(name) {
208-
jsonName = EncodeSegment([]byte(name))
236+
nameJSON, err := marshalRedisBinaryValue([]byte(name))
237+
if err != nil {
238+
return nil, err
209239
}
210240
valueJSON, err := marshalRedisBinaryValue(st.fields[name])
211241
if err != nil {
212242
return nil, err
213243
}
214-
fields[jsonName] = valueJSON
244+
fields = append(fields, hashFieldRecord{Name: nameJSON, Value: valueJSON})
215245
}
216246
type out struct {
217-
FormatVersion uint32 `json:"format_version"`
218-
Fields hashJSONFields `json:"fields"`
219-
ExpireAtMs *uint64 `json:"expire_at_ms"`
247+
FormatVersion uint32 `json:"format_version"`
248+
Fields []hashFieldRecord `json:"fields"`
249+
ExpireAtMs *uint64 `json:"expire_at_ms"`
220250
}
221251
rec := out{FormatVersion: 1, Fields: fields}
222252
if st.hasTTL {

internal/backup/redis_hash_test.go

Lines changed: 116 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -70,17 +70,43 @@ func hashFloat(t *testing.T, m map[string]any, key string) float64 {
7070
return f
7171
}
7272

73-
func hashSubMap(t *testing.T, m map[string]any, key string) map[string]any {
73+
// hashFieldArray fetches the `fields` JSON array from a decoded hash
74+
// JSON file and returns it as a slice of {name, value} maps. The
75+
// dump-format projection emits fields as an ARRAY of records (not a
76+
// JSON object) so binary-safe Redis field NAMES round-trip without
77+
// collision — see the comment on hashFieldRecord in redis_hash.go.
78+
func hashFieldArray(t *testing.T, m map[string]any) []map[string]any {
7479
t.Helper()
75-
v, ok := m[key]
80+
v, ok := m["fields"]
7681
if !ok {
77-
t.Fatalf("field %q missing in %+v", key, m)
82+
t.Fatalf("fields missing in %+v", m)
7883
}
79-
sub, ok := v.(map[string]any)
84+
raw, ok := v.([]any)
8085
if !ok {
81-
t.Fatalf("field %q = %T(%v), want map[string]any", key, v, v)
86+
t.Fatalf("fields = %T(%v), want []any", v, v)
8287
}
83-
return sub
88+
out := make([]map[string]any, 0, len(raw))
89+
for i, r := range raw {
90+
rm, ok := r.(map[string]any)
91+
if !ok {
92+
t.Fatalf("fields[%d] = %T(%v), want map[string]any", i, r, r)
93+
}
94+
out = append(out, rm)
95+
}
96+
return out
97+
}
98+
99+
// hashFieldByName looks up a single field in the array shape and
100+
// returns its `value` element (UTF-8 string or base64 envelope).
101+
func hashFieldByName(t *testing.T, m map[string]any, name string) any {
102+
t.Helper()
103+
for _, f := range hashFieldArray(t, m) {
104+
if f["name"] == name {
105+
return f["value"]
106+
}
107+
}
108+
t.Fatalf("field %q not found in %+v", name, m["fields"])
109+
return nil
84110
}
85111

86112
func TestRedisDB_HashRoundTripBasic(t *testing.T) {
@@ -105,12 +131,11 @@ func TestRedisDB_HashRoundTripBasic(t *testing.T) {
105131
if got["expire_at_ms"] != nil {
106132
t.Fatalf("expire_at_ms must be nil without TTL, got %v", got["expire_at_ms"])
107133
}
108-
fields := hashSubMap(t, got, "fields")
109-
if fields["name"] != "alice" {
110-
t.Fatalf("name = %v want alice", fields["name"])
134+
if v := hashFieldByName(t, got, "name"); v != "alice" {
135+
t.Fatalf("name = %v want alice", v)
111136
}
112-
if fields["age"] != "30" {
113-
t.Fatalf("age = %v want 30", fields["age"])
137+
if v := hashFieldByName(t, got, "age"); v != "30" {
138+
t.Fatalf("age = %v want 30", v)
114139
}
115140
}
116141

@@ -126,9 +151,8 @@ func TestRedisDB_HashEmptyHashStillEmitsFile(t *testing.T) {
126151
t.Fatal(err)
127152
}
128153
got := readHashJSON(t, filepath.Join(root, "redis", "db_0", "hashes", "empty.json"))
129-
fields := hashSubMap(t, got, "fields")
130-
if len(fields) != 0 {
131-
t.Fatalf("empty hash should emit empty fields object, got %v", fields)
154+
if fields := hashFieldArray(t, got); len(fields) != 0 {
155+
t.Fatalf("empty hash should emit empty fields array, got %v", fields)
132156
}
133157
}
134158

@@ -206,16 +230,91 @@ func TestRedisDB_HashBinaryFieldValueUsesBase64Envelope(t *testing.T) {
206230
t.Fatal(err)
207231
}
208232
got := readHashJSON(t, filepath.Join(root, "redis", "db_0", "hashes", "k.json"))
209-
fields := hashSubMap(t, got, "fields")
210-
envelope, ok := fields["blob"].(map[string]any)
233+
v := hashFieldByName(t, got, "blob")
234+
envelope, ok := v.(map[string]any)
211235
if !ok {
212-
t.Fatalf("expected base64 envelope for binary value, got %T(%v)", fields["blob"], fields["blob"])
236+
t.Fatalf("expected base64 envelope for binary value, got %T(%v)", v, v)
213237
}
214238
if envelope["base64"] == "" {
215239
t.Fatalf("base64 envelope missing payload: %v", envelope)
216240
}
217241
}
218242

243+
// TestRedisDB_HashBinaryFieldNameRoundTripsViaArray covers the Codex
244+
// P1 round-12 scenario: a UTF-8-literal field name `%FF` and a
245+
// 1-byte binary field name `0xFF` would collide if `fields` were a
246+
// JSON object, because EncodeSegment percent-encodes 0xFF to `%FF`.
247+
// With the array-of-records shape both names round-trip
248+
// independently — the binary name uses the typed base64 envelope
249+
// while the UTF-8 literal stays as a plain JSON string.
250+
func TestRedisDB_HashBinaryFieldNameRoundTripsViaArray(t *testing.T) {
251+
t.Parallel()
252+
db, root := newRedisDB(t)
253+
if err := db.HandleHashMeta(hashMetaKey("k"), encodeHashMetaValue(2)); err != nil {
254+
t.Fatal(err)
255+
}
256+
if err := db.HandleHashField(hashFieldKey("k", "%FF"), []byte("literal-percent-ff")); err != nil {
257+
t.Fatal(err)
258+
}
259+
if err := db.HandleHashField(append(hashFieldKey("k", ""), 0xff), []byte("binary-byte")); err != nil {
260+
t.Fatal(err)
261+
}
262+
if err := db.Finalize(); err != nil {
263+
t.Fatal(err)
264+
}
265+
got := readHashJSON(t, filepath.Join(root, "redis", "db_0", "hashes", "k.json"))
266+
fields := hashFieldArray(t, got)
267+
if len(fields) != 2 {
268+
t.Fatalf("len(fields) = %d, want 2 (binary and literal must be distinct)", len(fields))
269+
}
270+
literalSeen, binarySeen := classifyHashFieldShapes(t, fields)
271+
if !literalSeen {
272+
t.Fatalf("literal %%FF field missing from %+v", fields)
273+
}
274+
if !binarySeen {
275+
t.Fatalf("binary 0xFF field missing from %+v", fields)
276+
}
277+
}
278+
279+
// classifyHashFieldShapes folds the binary-vs-literal field-shape
280+
// assertions out of the parent test so cyclomatic complexity stays
281+
// under the project linter's ceiling. Returns (sawUTF8Literal,
282+
// sawBase64Envelope).
283+
func classifyHashFieldShapes(t *testing.T, fields []map[string]any) (bool, bool) {
284+
t.Helper()
285+
var literalSeen, binarySeen bool
286+
for _, f := range fields {
287+
switch n := f["name"].(type) {
288+
case string:
289+
if n == "%FF" {
290+
literalSeen = true
291+
if f["value"] != "literal-percent-ff" {
292+
t.Fatalf("literal value = %v", f["value"])
293+
}
294+
}
295+
case map[string]any:
296+
if n["base64"] == base64URLEncode(0xff) {
297+
binarySeen = true
298+
if f["value"] != "binary-byte" {
299+
t.Fatalf("binary value = %v", f["value"])
300+
}
301+
}
302+
default:
303+
t.Fatalf("unexpected name type %T(%v)", f["name"], f["name"])
304+
}
305+
}
306+
return literalSeen, binarySeen
307+
}
308+
309+
func base64URLEncode(b byte) string {
310+
// base64.RawURLEncoding of a single byte; matches what
311+
// marshalRedisBinaryValue emits for non-UTF-8 content.
312+
const alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
313+
hi := alphabet[b>>2]
314+
lo := alphabet[(b&0x3)<<4]
315+
return string([]byte{hi, lo})
316+
}
317+
219318
func TestRedisDB_HashRejectsMalformedMetaValueLength(t *testing.T) {
220319
t.Parallel()
221320
db, _ := newRedisDB(t)

0 commit comments

Comments
 (0)