diff --git a/internal/backup/dynamodb.go b/internal/backup/dynamodb.go
new file mode 100644
index 00000000..f3ec74bb
--- /dev/null
+++ b/internal/backup/dynamodb.go
@@ -0,0 +1,482 @@
+package backup
+
+import (
+	"bytes"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	pb "github.com/bootjp/elastickv/proto"
+	"github.com/cockroachdb/errors"
+	gproto "google.golang.org/protobuf/proto"
+)
+
+// Snapshot prefixes the DynamoDB encoder dispatches on. Mirror the live
+// constants in kv/shard_key.go (DynamoTableMetaPrefix etc.) so a renamed
+// prefix is caught by the dispatch tests below.
+const (
+	DDBTableMetaPrefix = "!ddb|meta|table|"
+	DDBTableGenPrefix  = "!ddb|meta|gen|"
+	DDBItemPrefix      = "!ddb|item|"
+	DDBGSIPrefix       = "!ddb|gsi|"
+)
+
+// Stored value magic prefixes. Mirror adapter/dynamodb_storage_codec.go:15-16.
+// Values that lack the right magic are rejected as either future-schema or
+// genuinely corrupt — Phase 0a does not silently emit garbage.
+var (
+	storedDDBSchemaMagic = []byte{0x00, 'D', 'S', 0x01}
+	storedDDBItemMagic   = []byte{0x00, 'D', 'I', 0x01}
+)
+
+// ErrDDBInvalidSchema, ErrDDBInvalidItem, ErrDDBMalformedKey are the
+// typed error classes for this encoder. Surface via errors.Is.
+var (
+	ErrDDBInvalidSchema = errors.New("backup: invalid !ddb|meta|table value")
+	ErrDDBInvalidItem   = errors.New("backup: invalid !ddb|item value")
+	ErrDDBMalformedKey  = errors.New("backup: malformed DynamoDB key")
+)
+
+// DDBEncoder encodes the DynamoDB prefix family into the per-table layout
+// described in docs/design/2026_04_29_proposed_snapshot_logical_decoder.md
+// (Phase 0): one `_schema.json` per table and one
+// `items/<hash>[/<range>].json` per item (default per-item layout).
+//
+// Lifecycle: Handle* per record, Finalize once. Items arrive before the
+// schema in lex order ('i' < 'm' under !ddb|), so the encoder buffers
+// per-encoded-table-segment and emits at Finalize once the schema is
+// known.
+//
+// Wide-column GSI rows (!ddb|gsi|*) are NOT dumped: they are derivable
+// from the base item set + schema, and replaying GSI rows on restore
+// would conflict with the destination's own index maintenance.
+type DDBEncoder struct {
+	outRoot     string
+	bundleJSONL bool
+
+	tables map[string]*ddbTableState
+
+	warn func(event string, fields ...any)
+}
+
+type ddbTableState struct {
+	encoded string
+	name    string
+	schema  *pb.DynamoTableSchema
+	items   []*pb.DynamoItem
+}
+
+// NewDDBEncoder constructs an encoder rooted at <outRoot>/dynamodb/.
+func NewDDBEncoder(outRoot string) *DDBEncoder {
+	return &DDBEncoder{
+		outRoot: outRoot,
+		tables:  make(map[string]*ddbTableState),
+	}
+}
+
+// WithBundleJSONL switches the per-table layout to `items/data-<n>.jsonl`
+// (one item per line). Default is per-item files. The choice is recorded
+// in MANIFEST.json (`dynamodb_layout`) by the master pipeline; the
+// encoder itself only needs the flag to pick the on-disk shape.
+//
+// Bundle mode is a follow-up: this PR ships per-item only. Calling
+// WithBundleJSONL(true) returns an error from Finalize until the bundle
+// path lands.
+func (d *DDBEncoder) WithBundleJSONL(on bool) *DDBEncoder {
+	d.bundleJSONL = on
+	return d
+}
+
+// WithWarnSink wires structured-warning emission (orphan items,
+// schema-less tables, etc.).
+func (d *DDBEncoder) WithWarnSink(fn func(event string, fields ...any)) *DDBEncoder {
+	d.warn = fn
+	return d
+}
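To make the Handle*-then-Finalize lifecycle concrete, here is a minimal driver sketch. This is an assumption about how the master pipeline might feed the encoder, not code from this PR; `records` stands in for whatever snapshot iterator the pipeline actually uses:

```go
// Hypothetical driver: dispatch snapshot records to the encoder by prefix,
// then flush. Records are assumed to arrive in key order, as they do when
// walking a snapshot.
func encodeDynamoRecords(records [][2][]byte, enc *DDBEncoder) error {
	for _, kv := range records {
		key, value := kv[0], kv[1]
		var err error
		switch {
		case bytes.HasPrefix(key, []byte(DDBTableMetaPrefix)):
			err = enc.HandleTableMeta(key, value)
		case bytes.HasPrefix(key, []byte(DDBItemPrefix)):
			err = enc.HandleItem(key, value)
		case bytes.HasPrefix(key, []byte(DDBGSIPrefix)):
			err = enc.HandleGSIRow(key, value) // no-op: derivable from items + schema
		case bytes.HasPrefix(key, []byte(DDBTableGenPrefix)):
			err = enc.HandleTableGen(key, value) // no-op: operational state
		}
		if err != nil {
			return err
		}
	}
	return enc.Finalize() // emits _schema.json + items/ once schemas are known
}
```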
+// HandleTableMeta processes a !ddb|meta|table|<encoded-table> record.
+// Strips the magic prefix, proto-unmarshals into DynamoTableSchema, and
+// parks it on the per-table state.
+func (d *DDBEncoder) HandleTableMeta(key, value []byte) error {
+	encoded, err := stripPrefixSegment(key, []byte(DDBTableMetaPrefix))
+	if err != nil {
+		return errors.Wrap(ErrDDBMalformedKey, err.Error())
+	}
+	rawName, err := base64.RawURLEncoding.DecodeString(encoded)
+	if err != nil {
+		return errors.Wrap(ErrDDBMalformedKey, err.Error())
+	}
+	if !bytes.HasPrefix(value, storedDDBSchemaMagic) {
+		return errors.Wrap(ErrDDBInvalidSchema, "missing magic prefix")
+	}
+	body := value[len(storedDDBSchemaMagic):]
+	schema := &pb.DynamoTableSchema{}
+	if err := gproto.Unmarshal(body, schema); err != nil {
+		return errors.Wrap(ErrDDBInvalidSchema, err.Error())
+	}
+	st := d.tableState(encoded)
+	st.name = string(rawName)
+	st.schema = schema
+	return nil
+}
+
+// HandleItem processes a !ddb|item|<encoded-table>|<gen>|<encoded-key>
+// record. The encoded table segment is parsed out of the key (everything
+// before the first `|` once `!ddb|item|` is stripped) and the item
+// proto is buffered until Finalize. We do NOT parse the rest of the
+// key here: every primary-key value the item could hold is also
+// present in the proto's attributes map, and the schema (which arrives
+// later in lex order) is what tells us which attributes are the hash
+// and range keys.
+func (d *DDBEncoder) HandleItem(key, value []byte) error {
+	encoded, err := parseDDBItemKey(key)
+	if err != nil {
+		return err
+	}
+	if !bytes.HasPrefix(value, storedDDBItemMagic) {
+		return errors.Wrap(ErrDDBInvalidItem, "missing magic prefix")
+	}
+	body := value[len(storedDDBItemMagic):]
+	item := &pb.DynamoItem{}
+	if err := gproto.Unmarshal(body, item); err != nil {
+		return errors.Wrap(ErrDDBInvalidItem, err.Error())
+	}
+	st := d.tableState(encoded)
+	st.items = append(st.items, item)
+	return nil
+}
+
+// HandleGSIRow drops GSI rows by default (they are derivable from the
+// base item set + schema). Exposed as a no-op so the master pipeline
+// can dispatch all !ddb|* prefixes uniformly without special-casing.
+func (d *DDBEncoder) HandleGSIRow(_, _ []byte) error { return nil }
+
+// HandleTableGen drops the per-table generation counter (operational
+// state, not user-visible).
+func (d *DDBEncoder) HandleTableGen(_, _ []byte) error { return nil }
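Because Finalize (below) reports orphaned tables through the warn sink rather than failing the whole dump, a caller typically bridges the sink to its logger. A minimal sketch, assuming the caller uses `log/slog` — the encoder itself only requires the `func(event string, fields ...any)` shape:

```go
// Hypothetical wiring: forward encoder warnings (e.g. ddb_orphan_items,
// emitted during Finalize for tables whose schema record never arrived)
// to a structured logger. Requires `import "log/slog"`.
enc := NewDDBEncoder(outRoot).WithWarnSink(func(event string, fields ...any) {
	slog.Warn(event, fields...)
})
```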
+// Finalize emits each table's _schema.json and per-item JSON files.
+// Tables with items but no schema (orphans — e.g., the schema record
+// was lost or excluded) emit a warning and are skipped. Tables with
+// a schema but no items emit a _schema.json and an empty items/
+// directory.
+func (d *DDBEncoder) Finalize() error {
+	if d.bundleJSONL {
+		return errors.New("backup: dynamodb_layout=jsonl not implemented in this PR")
+	}
+	var firstErr error
+	for _, st := range d.tables {
+		if st.schema == nil {
+			d.emitWarn("ddb_orphan_items",
+				"encoded_table", st.encoded,
+				"buffered_items", len(st.items))
+			continue
+		}
+		if err := d.flushTable(st); err != nil && firstErr == nil {
+			firstErr = err
+		}
+	}
+	return firstErr
+}
+
+func (d *DDBEncoder) flushTable(st *ddbTableState) error {
+	dir := filepath.Join(d.outRoot, "dynamodb", EncodeSegment([]byte(st.name)))
+	itemsDir := filepath.Join(dir, "items")
+	if err := os.MkdirAll(itemsDir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode
+		return errors.WithStack(err)
+	}
+	if err := writeFileAtomic(filepath.Join(dir, "_schema.json"), mustMarshalIndent(schemaToPublic(st.schema))); err != nil {
+		return err
+	}
+	hashKey := st.schema.GetPrimaryKey().GetHashKey()
+	rangeKey := st.schema.GetPrimaryKey().GetRangeKey()
+	for _, item := range st.items {
+		if err := writeDDBItem(itemsDir, hashKey, rangeKey, item); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (d *DDBEncoder) emitWarn(event string, fields ...any) {
+	if d.warn == nil {
+		return
+	}
+	d.warn(event, fields...)
+}
+
+func (d *DDBEncoder) tableState(encoded string) *ddbTableState {
+	if st, ok := d.tables[encoded]; ok {
+		return st
+	}
+	st := &ddbTableState{encoded: encoded}
+	d.tables[encoded] = st
+	return st
+}
+
+// parseDDBItemKey extracts the encoded table segment from
+// !ddb|item|<encoded-table>|<gen>|<encoded-key>. base64url does not
+// contain `|`, so a strict `|` split between the table segment and the
+// gen is unambiguous.
+func parseDDBItemKey(key []byte) (string, error) {
+	rest, err := stripPrefixSegment(key, []byte(DDBItemPrefix))
+	if err != nil {
+		return "", errors.Wrap(ErrDDBMalformedKey, err.Error())
+	}
+	idx := strings.IndexByte(rest, '|')
+	if idx <= 0 {
+		return "", errors.Wrapf(ErrDDBMalformedKey,
+			"item key missing table/gen separator: %q", key)
+	}
+	enc := rest[:idx]
+	if _, err := base64.RawURLEncoding.DecodeString(enc); err != nil {
+		return "", errors.Wrap(ErrDDBMalformedKey, err.Error())
+	}
+	return enc, nil
+}
+
+// writeDDBItem emits one item under itemsDir/<hash>[/<range>].json. The
+// hash-only and composite-key shapes match the design's two examples.
+// A missing hash-key attribute on an item is a structural error (the
+// item could never have been GetItem-able without one) and surfaces
+// as ErrDDBInvalidItem.
+func writeDDBItem(itemsDir, hashKey, rangeKey string, item *pb.DynamoItem) error { + attrs := item.GetAttributes() + hashVal, ok := attrs[hashKey] + if !ok { + return errors.Wrapf(ErrDDBInvalidItem, + "item missing hash-key attribute %q", hashKey) + } + hashFilename, err := ddbKeyAttrToSegment(hashVal) + if err != nil { + return err + } + publicItem := itemToPublic(item) + body, err := json.MarshalIndent(publicItem, "", " ") + if err != nil { + return errors.WithStack(err) + } + if rangeKey == "" { + return writeFileAtomic(filepath.Join(itemsDir, hashFilename+".json"), body) + } + rangeVal, ok := attrs[rangeKey] + if !ok { + return errors.Wrapf(ErrDDBInvalidItem, + "item missing range-key attribute %q", rangeKey) + } + rangeFilename, err := ddbKeyAttrToSegment(rangeVal) + if err != nil { + return err + } + dir := filepath.Join(itemsDir, hashFilename) + if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode + return errors.WithStack(err) + } + return writeFileAtomic(filepath.Join(dir, rangeFilename+".json"), body) +} + +// ddbKeyAttrToSegment encodes a primary-key attribute (S, N, or B) to +// a filesystem-safe segment. Per the design, S and N take the standard +// EncodeSegment path; B takes EncodeBinarySegment so binary keys never +// collide with string keys whose hex shape happens to look like +// base64. +// +// All other attribute kinds are rejected — DynamoDB primary keys can +// only be S, N, or B. +func ddbKeyAttrToSegment(av *pb.DynamoAttributeValue) (string, error) { + switch v := av.GetValue().(type) { + case *pb.DynamoAttributeValue_S: + return EncodeSegment([]byte(v.S)), nil + case *pb.DynamoAttributeValue_N: + return EncodeSegment([]byte(v.N)), nil + case *pb.DynamoAttributeValue_B: + return EncodeBinarySegment(v.B), nil + } + return "", errors.Wrapf(ErrDDBInvalidItem, + "primary key has unsupported attribute kind %T", av.GetValue()) +} + +// schemaToPublic projects DynamoTableSchema into the AWS-DescribeTable +// JSON shape documented in the design. Fields the live record carries +// for cluster-internal reasons (key_encoding_version, generation, +// migrating_from_generation) are stripped — they are not part of the +// user-visible schema and would not be re-applicable on restore. 
+func schemaToPublic(s *pb.DynamoTableSchema) ddbPublicSchema { + pk := publicKeySchema{ + HashKey: publicKeyAttribute{Name: s.GetPrimaryKey().GetHashKey()}, + RangeKey: publicKeyAttribute{Name: s.GetPrimaryKey().GetRangeKey()}, + } + if s.GetPrimaryKey().GetRangeKey() == "" { + pk.RangeKey = publicKeyAttribute{} + } + defs := make(map[string]string, len(s.GetAttributeDefinitions())) + for k, v := range s.GetAttributeDefinitions() { + defs[k] = v + } + pk.HashKey.Type = defs[pk.HashKey.Name] + if pk.RangeKey.Name != "" { + pk.RangeKey.Type = defs[pk.RangeKey.Name] + } + gsis := make([]publicGSI, 0, len(s.GetGlobalSecondaryIndexes())) + for name, gsi := range s.GetGlobalSecondaryIndexes() { + g := publicGSI{ + Name: name, + KeySchema: publicKeySchema{ + HashKey: publicKeyAttribute{Name: gsi.GetKeySchema().GetHashKey()}, + RangeKey: publicKeyAttribute{Name: gsi.GetKeySchema().GetRangeKey()}, + }, + } + g.KeySchema.HashKey.Type = defs[g.KeySchema.HashKey.Name] + if g.KeySchema.RangeKey.Name != "" { + g.KeySchema.RangeKey.Type = defs[g.KeySchema.RangeKey.Name] + } else { + g.KeySchema.RangeKey = publicKeyAttribute{} + } + g.Projection.Type = gsi.GetProjection().GetProjectionType() + g.Projection.NonKeyAttributes = append([]string{}, gsi.GetProjection().GetNonKeyAttributes()...) + gsis = append(gsis, g) + } + attrDefs := make([]publicAttributeDefinition, 0, len(defs)) + for name, ty := range defs { + attrDefs = append(attrDefs, publicAttributeDefinition{Name: name, Type: ty}) + } + return ddbPublicSchema{ + FormatVersion: 1, + TableName: s.GetTableName(), + PrimaryKey: pk, + AttributeDefinitions: attrDefs, + GlobalSecondaryIndexes: gsis, + } +} + +type ddbPublicSchema struct { + FormatVersion uint32 `json:"format_version"` + TableName string `json:"table_name"` + PrimaryKey publicKeySchema `json:"primary_key"` + AttributeDefinitions []publicAttributeDefinition `json:"attribute_definitions"` + GlobalSecondaryIndexes []publicGSI `json:"global_secondary_indexes,omitempty"` +} + +type publicKeySchema struct { + HashKey publicKeyAttribute `json:"hash_key"` + RangeKey publicKeyAttribute `json:"range_key,omitempty"` +} + +type publicKeyAttribute struct { + Name string `json:"name,omitempty"` + Type string `json:"type,omitempty"` +} + +type publicAttributeDefinition struct { + Name string `json:"name"` + Type string `json:"type"` +} + +type publicGSI struct { + Name string `json:"name"` + KeySchema publicKeySchema `json:"key_schema"` + Projection publicProjection `json:"projection"` +} + +type publicProjection struct { + Type string `json:"type"` + NonKeyAttributes []string `json:"non_key_attributes,omitempty"` +} + +// itemToPublic translates a DynamoItem proto into the AWS-DynamoDB-JSON +// shape: a top-level map of attribute name -> typed-attribute object. +// The attribute objects use the standard AWS keys (S, N, B, BOOL, +// NULL, SS, NS, BS, L, M). +func itemToPublic(item *pb.DynamoItem) map[string]any { + out := make(map[string]any, len(item.GetAttributes())) + for name, av := range item.GetAttributes() { + out[name] = attributeValueToPublic(av) + } + return out +} + +func attributeValueToPublic(av *pb.DynamoAttributeValue) map[string]any { + if scalar := scalarAttributeValueToPublic(av); scalar != nil { + return scalar + } + if set := setAttributeValueToPublic(av); set != nil { + return set + } + if comp := compositeAttributeValueToPublic(av); comp != nil { + return comp + } + // Empty oneof. 
AWS treats this as malformed; preserve as NULL so + // the dump remains deserialisable rather than embedding an empty + // object that downstream tools might reject. + return map[string]any{"NULL": true} +} + +func scalarAttributeValueToPublic(av *pb.DynamoAttributeValue) map[string]any { + switch v := av.GetValue().(type) { + case *pb.DynamoAttributeValue_S: + return map[string]any{"S": v.S} + case *pb.DynamoAttributeValue_N: + return map[string]any{"N": v.N} + case *pb.DynamoAttributeValue_B: + return map[string]any{"B": v.B} + case *pb.DynamoAttributeValue_BoolValue: + return map[string]any{"BOOL": v.BoolValue} + case *pb.DynamoAttributeValue_NullValue: + return map[string]any{"NULL": v.NullValue} + } + return nil +} + +func setAttributeValueToPublic(av *pb.DynamoAttributeValue) map[string]any { + switch v := av.GetValue().(type) { + case *pb.DynamoAttributeValue_Ss: + return map[string]any{"SS": append([]string{}, v.Ss.GetValues()...)} + case *pb.DynamoAttributeValue_Ns: + return map[string]any{"NS": append([]string{}, v.Ns.GetValues()...)} + case *pb.DynamoAttributeValue_Bs: + return map[string]any{"BS": append([][]byte{}, v.Bs.GetValues()...)} + } + return nil +} + +func compositeAttributeValueToPublic(av *pb.DynamoAttributeValue) map[string]any { + switch v := av.GetValue().(type) { + case *pb.DynamoAttributeValue_L: + out := make([]map[string]any, 0, len(v.L.GetValues())) + for _, child := range v.L.GetValues() { + out = append(out, attributeValueToPublic(child)) + } + return map[string]any{"L": out} + case *pb.DynamoAttributeValue_M: + out := make(map[string]any, len(v.M.GetValues())) + for k, child := range v.M.GetValues() { + out[k] = attributeValueToPublic(child) + } + return map[string]any{"M": out} + } + return nil +} + +// EncodeDDBItemKey constructs a !ddb|item key for tests. Mirrors the +// live legacyDynamoItemKey constructor in adapter/dynamodb.go (string +// hash + range, simplest shape). +func EncodeDDBItemKey(tableName string, generation uint64, hashKey, rangeKey string) []byte { + out := []byte(DDBItemPrefix) + out = append(out, base64.RawURLEncoding.EncodeToString([]byte(tableName))...) + out = append(out, '|') + out = append(out, fmt.Sprintf("%d", generation)...) + out = append(out, '|') + out = append(out, base64.RawURLEncoding.EncodeToString([]byte(hashKey))...) + if rangeKey != "" { + out = append(out, '|') + out = append(out, base64.RawURLEncoding.EncodeToString([]byte(rangeKey))...) + } + return out +} + +// EncodeDDBTableMetaKey constructs a !ddb|meta|table key for tests. +func EncodeDDBTableMetaKey(tableName string) []byte { + return []byte(DDBTableMetaPrefix + base64.RawURLEncoding.EncodeToString([]byte(tableName))) +} diff --git a/internal/backup/dynamodb_test.go b/internal/backup/dynamodb_test.go new file mode 100644 index 00000000..122bc271 --- /dev/null +++ b/internal/backup/dynamodb_test.go @@ -0,0 +1,343 @@ +package backup + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + pb "github.com/bootjp/elastickv/proto" + "github.com/cockroachdb/errors" + gproto "google.golang.org/protobuf/proto" +) + +func encodeSchemaValue(t *testing.T, schema *pb.DynamoTableSchema) []byte { + t.Helper() + body, err := gproto.Marshal(schema) + if err != nil { + t.Fatalf("marshal schema: %v", err) + } + out := append([]byte{}, storedDDBSchemaMagic...) + return append(out, body...) 
+} + +func encodeItemValue(t *testing.T, item *pb.DynamoItem) []byte { + t.Helper() + body, err := gproto.Marshal(item) + if err != nil { + t.Fatalf("marshal item: %v", err) + } + out := append([]byte{}, storedDDBItemMagic...) + return append(out, body...) +} + +func sAttr(s string) *pb.DynamoAttributeValue { + return &pb.DynamoAttributeValue{Value: &pb.DynamoAttributeValue_S{S: s}} +} + +func nAttr(n string) *pb.DynamoAttributeValue { + return &pb.DynamoAttributeValue{Value: &pb.DynamoAttributeValue_N{N: n}} +} + +func bAttr(b []byte) *pb.DynamoAttributeValue { + return &pb.DynamoAttributeValue{Value: &pb.DynamoAttributeValue_B{B: b}} +} + +func boolAttr(b bool) *pb.DynamoAttributeValue { + return &pb.DynamoAttributeValue{Value: &pb.DynamoAttributeValue_BoolValue{BoolValue: b}} +} + +func newDDBEncoder(t *testing.T) (*DDBEncoder, string) { + t.Helper() + root := t.TempDir() + return NewDDBEncoder(root), root +} + +func readPublicSchema(t *testing.T, path string) ddbPublicSchema { + t.Helper() + body, err := os.ReadFile(path) //nolint:gosec // test path + if err != nil { + t.Fatalf("read schema: %v", err) + } + var got ddbPublicSchema + if err := json.Unmarshal(body, &got); err != nil { + t.Fatalf("unmarshal schema: %v", err) + } + return got +} + +func readItemMap(t *testing.T, path string) map[string]any { + t.Helper() + body, err := os.ReadFile(path) //nolint:gosec // test path + if err != nil { + t.Fatalf("read item: %v", err) + } + var got map[string]any + if err := json.Unmarshal(body, &got); err != nil { + t.Fatalf("unmarshal item: %v", err) + } + return got +} + +func mustSubMap(t *testing.T, m map[string]any, key string) map[string]any { + t.Helper() + v, ok := m[key].(map[string]any) + if !ok { + t.Fatalf("field %q wrong shape: %v", key, m[key]) + } + return v +} + +func TestDDB_HashOnlyTableRoundTrip(t *testing.T) { + t.Parallel() + enc, root := newDDBEncoder(t) + schema := &pb.DynamoTableSchema{ + TableName: "sessions", + PrimaryKey: &pb.DynamoKeySchema{HashKey: "session_id"}, + AttributeDefinitions: map[string]string{"session_id": "S"}, + Generation: 1, + } + item := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + "session_id": sAttr("sess-abc123"), + "user_id": sAttr("alice"), + "flags": boolAttr(true), + "count": nAttr("42"), + }} + if err := enc.HandleItem(EncodeDDBItemKey("sessions", 1, "sess-abc123", ""), encodeItemValue(t, item)); err != nil { + t.Fatalf("HandleItem: %v", err) + } + if err := enc.HandleTableMeta(EncodeDDBTableMetaKey("sessions"), encodeSchemaValue(t, schema)); err != nil { + t.Fatalf("HandleTableMeta: %v", err) + } + if err := enc.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + + got := readPublicSchema(t, filepath.Join(root, "dynamodb", "sessions", "_schema.json")) + if got.TableName != "sessions" { + t.Fatalf("table_name = %q", got.TableName) + } + if got.PrimaryKey.HashKey.Name != "session_id" || got.PrimaryKey.HashKey.Type != "S" { + t.Fatalf("primary_key = %+v", got.PrimaryKey) + } + if got.PrimaryKey.RangeKey.Name != "" { + t.Fatalf("hash-only table must have empty range_key, got %+v", got.PrimaryKey.RangeKey) + } + + asMap := readItemMap(t, filepath.Join(root, "dynamodb", "sessions", "items", "sess-abc123.json")) + if mustSubMap(t, asMap, "session_id")["S"] != "sess-abc123" { + t.Fatalf("session_id.S = %v", asMap["session_id"]) + } + if mustSubMap(t, asMap, "flags")["BOOL"] != true { + t.Fatalf("flags.BOOL = %v", asMap["flags"]) + } +} + +func TestDDB_CompositeKeyTableRoundTrip(t *testing.T) { + t.Parallel() + enc, 
root := newDDBEncoder(t) + schema := &pb.DynamoTableSchema{ + TableName: "orders", + PrimaryKey: &pb.DynamoKeySchema{ + HashKey: "customer_id", + RangeKey: "order_ts", + }, + AttributeDefinitions: map[string]string{ + "customer_id": "S", + "order_ts": "S", + }, + Generation: 1, + } + item := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + "customer_id": sAttr("customer-7421"), + "order_ts": sAttr("2026-04-29T12:00:00Z"), + "total": nAttr("129.50"), + }} + if err := enc.HandleItem(EncodeDDBItemKey("orders", 1, "customer-7421", "2026-04-29T12:00:00Z"), encodeItemValue(t, item)); err != nil { + t.Fatal(err) + } + if err := enc.HandleTableMeta(EncodeDDBTableMetaKey("orders"), encodeSchemaValue(t, schema)); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + want := filepath.Join(root, "dynamodb", "orders", "items", "customer-7421", "2026-04-29T12%3A00%3A00Z.json") + if _, err := os.Stat(want); err != nil { + t.Fatalf("expected %s, stat err=%v", want, err) + } +} + +func TestDDB_BinaryHashKeyRendersAsB64Prefix(t *testing.T) { + t.Parallel() + enc, root := newDDBEncoder(t) + schema := &pb.DynamoTableSchema{ + TableName: "blobs", + PrimaryKey: &pb.DynamoKeySchema{ + HashKey: "id", + }, + AttributeDefinitions: map[string]string{"id": "B"}, + } + item := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + "id": bAttr([]byte{0x00, 0x01, 0x02}), + "data": sAttr("v"), + }} + if err := enc.HandleItem(EncodeDDBItemKey("blobs", 1, "doesnt-matter", ""), encodeItemValue(t, item)); err != nil { + t.Fatal(err) + } + if err := enc.HandleTableMeta(EncodeDDBTableMetaKey("blobs"), encodeSchemaValue(t, schema)); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + want := filepath.Join(root, "dynamodb", "blobs", "items", "b64.AAEC.json") + if _, err := os.Stat(want); err != nil { + t.Fatalf("expected %s, stat err=%v", want, err) + } +} + +func TestDDB_OrphanItemsWithoutSchemaWarn(t *testing.T) { + t.Parallel() + enc, _ := newDDBEncoder(t) + var events []string + enc.WithWarnSink(func(event string, _ ...any) { + events = append(events, event) + }) + item := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + "id": sAttr("orphan"), + }} + if err := enc.HandleItem(EncodeDDBItemKey("ghost", 1, "orphan", ""), encodeItemValue(t, item)); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + if len(events) != 1 || events[0] != "ddb_orphan_items" { + t.Fatalf("events = %v", events) + } +} + +func TestDDB_RejectsValueWithoutMagic(t *testing.T) { + t.Parallel() + t.Run("schema", func(t *testing.T) { + enc, _ := newDDBEncoder(t) + err := enc.HandleTableMeta(EncodeDDBTableMetaKey("t"), []byte("not-magic")) + if !errors.Is(err, ErrDDBInvalidSchema) { + t.Fatalf("err=%v", err) + } + }) + t.Run("item", func(t *testing.T) { + enc, _ := newDDBEncoder(t) + err := enc.HandleItem(EncodeDDBItemKey("t", 1, "h", ""), []byte("not-magic")) + if !errors.Is(err, ErrDDBInvalidItem) { + t.Fatalf("err=%v", err) + } + }) +} + +func TestDDB_RejectsItemMissingHashKeyAttribute(t *testing.T) { + t.Parallel() + enc, _ := newDDBEncoder(t) + schema := &pb.DynamoTableSchema{ + TableName: "t", PrimaryKey: &pb.DynamoKeySchema{HashKey: "id"}, + AttributeDefinitions: map[string]string{"id": "S"}, + } + item := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + // "id" is missing + "other": sAttr("v"), + }} + if err := enc.HandleItem(EncodeDDBItemKey("t", 1, "x", ""), 
encodeItemValue(t, item)); err != nil { + t.Fatal(err) + } + if err := enc.HandleTableMeta(EncodeDDBTableMetaKey("t"), encodeSchemaValue(t, schema)); err != nil { + t.Fatal(err) + } + err := enc.Finalize() + if !errors.Is(err, ErrDDBInvalidItem) { + t.Fatalf("Finalize err=%v want ErrDDBInvalidItem", err) + } +} + +func TestDDB_GSIRowsIgnored(t *testing.T) { + t.Parallel() + enc, _ := newDDBEncoder(t) + if err := enc.HandleGSIRow([]byte("!ddb|gsi|whatever"), []byte("opaque")); err != nil { + t.Fatalf("HandleGSIRow should be a no-op, err=%v", err) + } +} + +func TestDDB_AllAttributeKindsRoundTripThroughJSON(t *testing.T) { + t.Parallel() + enc, root := newDDBEncoder(t) + schema := &pb.DynamoTableSchema{ + TableName: "kitchensink", PrimaryKey: &pb.DynamoKeySchema{HashKey: "id"}, + AttributeDefinitions: map[string]string{"id": "S"}, + } + item := &pb.DynamoItem{Attributes: map[string]*pb.DynamoAttributeValue{ + "id": sAttr("k"), + "s": sAttr("hi"), + "n": nAttr("1.5"), + "b": bAttr([]byte{0xff, 0x01}), + "bool_t": boolAttr(true), + "null_a": {Value: &pb.DynamoAttributeValue_NullValue{NullValue: true}}, + "ss": {Value: &pb.DynamoAttributeValue_Ss{Ss: &pb.DynamoStringSet{Values: []string{"a", "b"}}}}, + "ns": {Value: &pb.DynamoAttributeValue_Ns{Ns: &pb.DynamoNumberSet{Values: []string{"1", "2"}}}}, + "bs": {Value: &pb.DynamoAttributeValue_Bs{Bs: &pb.DynamoBinarySet{Values: [][]byte{{0x01}, {0x02}}}}}, + "l": {Value: &pb.DynamoAttributeValue_L{L: &pb.DynamoAttributeValueList{Values: []*pb.DynamoAttributeValue{sAttr("x"), nAttr("9")}}}}, + "m": {Value: &pb.DynamoAttributeValue_M{M: &pb.DynamoAttributeValueMap{Values: map[string]*pb.DynamoAttributeValue{"k1": sAttr("v1")}}}}, + }} + if err := enc.HandleItem(EncodeDDBItemKey("kitchensink", 1, "k", ""), encodeItemValue(t, item)); err != nil { + t.Fatal(err) + } + if err := enc.HandleTableMeta(EncodeDDBTableMetaKey("kitchensink"), encodeSchemaValue(t, schema)); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + got := readItemMap(t, filepath.Join(root, "dynamodb", "kitchensink", "items", "k.json")) + // Spot-check a few attributes; full per-attribute assertions live + // in the dedicated attributeValueToPublic tests below. + if mustSubMap(t, got, "s")["S"] != "hi" { + t.Fatalf("s = %v", got["s"]) + } + if mustSubMap(t, got, "bool_t")["BOOL"] != true { + t.Fatalf("bool_t = %v", got["bool_t"]) + } + lInner, ok := mustSubMap(t, got, "l")["L"].([]any) + if !ok { + t.Fatalf("l[\"L\"] wrong shape: %v", mustSubMap(t, got, "l")["L"]) + } + if len(lInner) != 2 { + t.Fatalf("l[\"L\"] len = %d want 2", len(lInner)) + } +} + +func TestDDB_AttributeValueToPublic_EmptyOneofSurfacedAsNull(t *testing.T) { + t.Parallel() + got := attributeValueToPublic(&pb.DynamoAttributeValue{}) + if got["NULL"] != true { + t.Fatalf("got %v want NULL=true", got) + } +} + +func TestDDB_BundleJSONLNotImplementedYet(t *testing.T) { + t.Parallel() + enc, _ := newDDBEncoder(t) + enc.WithBundleJSONL(true) + err := enc.Finalize() + if err == nil { + t.Fatalf("expected not-implemented error from Finalize on bundle mode") + } +} + +func TestDDB_RejectsKeyWithMissingTableSegment(t *testing.T) { + t.Parallel() + enc, _ := newDDBEncoder(t) + // Missing the table segment entirely. 
+	err := enc.HandleItem([]byte(DDBItemPrefix), []byte("ignored"))
+	if !errors.Is(err, ErrDDBMalformedKey) {
+		t.Fatalf("err=%v", err)
+	}
+}
diff --git a/internal/backup/filename.go b/internal/backup/filename.go
new file mode 100644
index 00000000..d224a2ab
--- /dev/null
+++ b/internal/backup/filename.go
@@ -0,0 +1,244 @@
+// Package backup implements the per-adapter logical-backup format defined in
+// docs/design/2026_04_29_proposed_snapshot_logical_decoder.md (Phase 0) and
+// reused by docs/design/2026_04_29_proposed_logical_backup.md (Phase 1).
+//
+// This file owns the filename encoding rules for non-S3 segments. S3 object
+// keys preserve their `/` separators (and so are not transformed by EncodeSegment);
+// every other adapter scope encodes user-supplied bytes through this path.
+//
+// Encoding rules (see "Filename encoding" in the Phase 0 doc):
+//
+//   - Bytes in the unreserved set [A-Za-z0-9._-] pass through.
+//   - Every other byte is rendered as %HH (uppercase hex), like
+//     application/x-www-form-urlencoded but applied to every non-allowlisted byte.
+//   - If the encoded result exceeds maxSegmentBytes (240), the segment is
+//     replaced with <sha256-hex-prefix>__<truncated-original> and the full
+//     original bytes must be recorded in KEYMAP.jsonl by the caller.
+//   - Binary DynamoDB partition / sort keys take a separate "b64.<base64url>"
+//     path so a binary key never collides with a string key whose hex encoding
+//     happens to look like base64. EncodeBinarySegment emits that form.
+package backup
+
+import (
+	"crypto/sha256"
+	"encoding/base64"
+	"encoding/hex"
+	"strings"
+
+	"github.com/cockroachdb/errors"
+)
+
+const (
+	// maxSegmentBytes is the maximum length of a single encoded path segment
+	// before the SHA-fallback kicks in. Chosen to leave headroom under the
+	// common NAME_MAX of 255: three-byte percent escapes (%HH) can grow a
+	// 240-byte raw segment to 720 encoded bytes in the worst case, but any
+	// segment whose encoded form exceeds the 240-byte ceiling takes the
+	// SHA-fallback path, so an emitted filename never overflows NAME_MAX.
+	maxSegmentBytes = 240
+
+	// shaFallbackHexPrefixBytes is the number of hex characters of SHA-256
+	// embedded in the SHA-fallback prefix. 32 hex chars == 128 bits of
+	// hash-prefix entropy — enough to make accidental collision negligible
+	// for any single scope.
+	shaFallbackHexPrefixBytes = 32
+
+	// shaFallbackTruncatedSuffixBytes is the number of leading bytes of the
+	// raw segment retained (after percent-encoding) in the SHA-fallback
+	// rendering. Total encoded segment is then at most:
+	//
+	//	shaFallbackHexPrefixBytes + len("__") + 3*shaFallbackTruncatedSuffixBytes
+	//
+	//	= 32 + 2 + 3*64 = 226 bytes (under the 240 ceiling).
+	//
+	// The truncated suffix is purely a human-recognisability aid; it does
+	// NOT carry enough information to reverse the original bytes — that is
+	// what KEYMAP.jsonl is for.
+	shaFallbackTruncatedSuffixBytes = 64
+
+	// binaryPrefix marks a DynamoDB B-attribute segment encoded as base64url.
+	binaryPrefix = "b64."
+
+	// shaFallbackSeparator separates the SHA-256 prefix from the truncated
+	// original bytes. Two underscores rather than one because single
+	// underscores are common in user keys; doubled is much rarer and so
+	// the boundary is unambiguous.
+	shaFallbackSeparator = "__"
+)
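A worked walkthrough of the encoded forms may help here. The following Go example-test-style sketch assumes `fmt` and `bytes` imports; the first three outputs are asserted by this PR's own tests, and the last line shows only the fallback's detectability because the exact hash prefix depends on SHA-256:

```go
// Illustrative example (not part of this PR): the three encoded forms.
func ExampleEncodeSegment() {
	fmt.Println(EncodeSegment([]byte("customer-7421")))        // unreserved: passthrough
	fmt.Println(EncodeSegment([]byte("2026-04-29T12:00:00Z"))) // ':' escapes to %3A
	fmt.Println(EncodeBinarySegment([]byte{0x00, 0x01, 0x02})) // binary form
	fmt.Println(IsShaFallback(EncodeSegment(bytes.Repeat([]byte("a"), 300))))
	// Output:
	// customer-7421
	// 2026-04-29T12%3A00%3A00Z
	// b64.AAEC
	// true
}
```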
+// ErrInvalidEncodedSegment is returned by DecodeSegment when its input is
+// neither a valid percent-encoded segment, a binary-prefixed segment, nor a
+// SHA-fallback segment.
+var ErrInvalidEncodedSegment = errors.New("backup: invalid encoded filename segment")
+
+// ErrShaFallbackNeedsKeymap is returned by DecodeSegment when its input is a
+// SHA-fallback segment. The segment cannot be reversed to its original bytes
+// from the filename alone — the caller must consult KEYMAP.jsonl.
+var ErrShaFallbackNeedsKeymap = errors.New("backup: filename uses SHA fallback; consult KEYMAP.jsonl")
+
+// EncodeSegment encodes a single user-supplied path segment for use as a
+// filename component. It is the inverse of DecodeSegment for non-fallback
+// inputs.
+//
+// The encoding is deterministic given the same input. It is NOT idempotent:
+// `%` is itself a reserved byte, so re-encoding an already-encoded segment
+// escapes its escapes. Decode-then-encode is the round-trip that holds.
+func EncodeSegment(raw []byte) string {
+	encoded := percentEncode(raw)
+	if len(encoded) <= maxSegmentBytes {
+		return encoded
+	}
+	return shaFallback(raw)
+}
+
+// EncodeBinarySegment encodes a DynamoDB B-attribute (binary) segment as
+// "b64.<base64url-no-padding>" so that binary keys never collide with string
+// keys whose hex-encoding happens to look like base64.
+//
+// b64-encoded segments take the SHA fallback if they exceed maxSegmentBytes
+// after the base64 expansion (~4/3 of the raw length).
+func EncodeBinarySegment(raw []byte) string {
+	enc := binaryPrefix + base64.RawURLEncoding.EncodeToString(raw)
+	if len(enc) <= maxSegmentBytes {
+		return enc
+	}
+	return shaFallback(raw)
+}
+
+// DecodeSegment is the inverse of EncodeSegment for percent-encoded and
+// binary-prefixed inputs. SHA-fallback inputs return ErrShaFallbackNeedsKeymap
+// so the caller knows to consult KEYMAP.jsonl rather than treat the partial
+// suffix as the original key.
+func DecodeSegment(seg string) ([]byte, error) {
+	if isShaFallback(seg) {
+		return nil, errors.WithStack(ErrShaFallbackNeedsKeymap)
+	}
+	if strings.HasPrefix(seg, binaryPrefix) {
+		raw, err := base64.RawURLEncoding.DecodeString(seg[len(binaryPrefix):])
+		if err != nil {
+			return nil, errors.Wrap(ErrInvalidEncodedSegment, err.Error())
+		}
+		return raw, nil
+	}
+	return percentDecode(seg)
+}
+
+// IsShaFallback reports whether seg uses the SHA-prefix-and-truncated-original
+// form. Such segments cannot be reversed without KEYMAP.jsonl.
+func IsShaFallback(seg string) bool {
+	return isShaFallback(seg)
+}
+
+// IsBinarySegment reports whether seg is a base64-url encoded binary segment
+// emitted by EncodeBinarySegment.
+func IsBinarySegment(seg string) bool {
+	return strings.HasPrefix(seg, binaryPrefix)
+}
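The non-idempotence noted above is the main pitfall for callers. An illustrative fragment (values follow directly from the rules: space and `%` are both outside the unreserved set):

```go
// Encode exactly once; decode exactly once.
enc1 := EncodeSegment([]byte("a b")) // "a%20b"
enc2 := EncodeSegment([]byte(enc1))  // "a%2520b" — double-encoded; decodes back to "a%20b", not "a b"
dec, err := DecodeSegment(enc1)      // []byte("a b"), err == nil
```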
+func percentEncode(raw []byte) string {
+	// Worst case: every byte expands to %HH (3 bytes). Pre-allocate.
+	var b strings.Builder
+	b.Grow(len(raw) * 3) //nolint:mnd // 3 == len("%HH"), local idiom
+	for _, c := range raw {
+		if isUnreserved(c) {
+			b.WriteByte(c)
+			continue
+		}
+		b.WriteByte('%')
+		b.WriteByte(hexUpper(c >> 4))   //nolint:mnd // 4 == nibble width
+		b.WriteByte(hexUpper(c & 0x0F)) //nolint:mnd // 0x0F == low-nibble mask
+	}
+	return b.String()
+}
+
+func percentDecode(seg string) ([]byte, error) {
+	out := make([]byte, 0, len(seg))
+	for i := 0; i < len(seg); i++ {
+		c := seg[i]
+		if c != '%' {
+			if !isUnreserved(c) {
+				return nil, errors.Wrapf(ErrInvalidEncodedSegment,
+					"unexpected raw byte 0x%02x at offset %d", c, i)
+			}
+			out = append(out, c)
+			continue
+		}
+		if i+2 >= len(seg) { //nolint:mnd // 2 == hex digit count after %
+			return nil, errors.Wrapf(ErrInvalidEncodedSegment,
+				"truncated percent escape at offset %d", i)
+		}
+		const (
+			hiNibbleOff = 1
+			loNibbleOff = 2
+		)
+		hi, ok := unhex(seg[i+hiNibbleOff])
+		if !ok {
+			return nil, errors.Wrapf(ErrInvalidEncodedSegment,
+				"non-hex digit 0x%02x at offset %d", seg[i+hiNibbleOff], i+hiNibbleOff)
+		}
+		lo, ok := unhex(seg[i+loNibbleOff])
+		if !ok {
+			return nil, errors.Wrapf(ErrInvalidEncodedSegment,
+				"non-hex digit 0x%02x at offset %d", seg[i+loNibbleOff], i+loNibbleOff)
+		}
+		out = append(out, (hi<<4)|lo) //nolint:mnd // 4 == nibble width
+		i += loNibbleOff // skip the two consumed hex digits
+	}
+	return out, nil
+}
+
+func shaFallback(raw []byte) string {
+	sum := sha256.Sum256(raw)
+	hashHex := hex.EncodeToString(sum[:])[:shaFallbackHexPrefixBytes]
+	suffix := raw
+	if len(suffix) > shaFallbackTruncatedSuffixBytes {
+		suffix = suffix[:shaFallbackTruncatedSuffixBytes]
+	}
+	return hashHex + shaFallbackSeparator + percentEncode(suffix)
+}
+
+func isShaFallback(seg string) bool {
+	if len(seg) < shaFallbackHexPrefixBytes+len(shaFallbackSeparator) {
+		return false
+	}
+	for i := 0; i < shaFallbackHexPrefixBytes; i++ {
+		if _, ok := unhex(seg[i]); !ok {
+			return false
+		}
+	}
+	return seg[shaFallbackHexPrefixBytes:shaFallbackHexPrefixBytes+len(shaFallbackSeparator)] == shaFallbackSeparator
+}
+
+// isUnreserved is the RFC3986 unreserved set minus "~": ALPHA / DIGIT /
+// "-" / "." / "_". "~" is excluded because it has caused interop problems
+// with older shells, and passing it through is not worth that rare risk.
+func isUnreserved(c byte) bool { + switch { + case c >= 'A' && c <= 'Z': + return true + case c >= 'a' && c <= 'z': + return true + case c >= '0' && c <= '9': + return true + case c == '-', c == '.', c == '_': + return true + } + return false +} + +func hexUpper(nibble byte) byte { + if nibble < 10 { //nolint:mnd // 10 == decimal/hex boundary + return '0' + nibble + } + return 'A' + (nibble - 10) //nolint:mnd // 10 == decimal/hex boundary +} + +func unhex(c byte) (byte, bool) { + switch { + case c >= '0' && c <= '9': + return c - '0', true + case c >= 'a' && c <= 'f': + return c - 'a' + 10, true //nolint:mnd // 10 == decimal/hex boundary + case c >= 'A' && c <= 'F': + return c - 'A' + 10, true //nolint:mnd // 10 == decimal/hex boundary + } + return 0, false +} diff --git a/internal/backup/filename_test.go b/internal/backup/filename_test.go new file mode 100644 index 00000000..c0ad737f --- /dev/null +++ b/internal/backup/filename_test.go @@ -0,0 +1,327 @@ +package backup + +import ( + "crypto/rand" + "strings" + "testing" + + "github.com/cockroachdb/errors" + "pgregory.net/rapid" +) + +func TestEncodeSegment_PassthroughForUnreserved(t *testing.T) { + t.Parallel() + // Every unreserved byte must round-trip without escaping. + cases := []string{ + "", + "a", + "A", + "0", + "-", + ".", + "_", + "abc", + "ABCdef-123_test.json", + "customer-7421", + "2026-04-29T12-00-00Z", + } + for _, c := range cases { + t.Run(c, func(t *testing.T) { + t.Parallel() + enc := EncodeSegment([]byte(c)) + if enc != c { + t.Fatalf("EncodeSegment(%q) = %q, want %q", c, enc, c) + } + dec, err := DecodeSegment(enc) + if err != nil { + t.Fatalf("DecodeSegment(%q) error: %v", enc, err) + } + if string(dec) != c { + t.Fatalf("round-trip: got %q, want %q", dec, c) + } + }) + } +} + +func TestEncodeSegment_PercentEscapesReservedBytes(t *testing.T) { + t.Parallel() + cases := map[string]string{ + "hello world": "hello%20world", + "a/b": "a%2Fb", + "a:b": "a%3Ab", + "key|with|pipe": "key%7Cwith%7Cpipe", + "\x00": "%00", + "\xff": "%FF", + "colon:.": "colon%3A.", + "plus+": "plus%2B", + } + for raw, want := range cases { + t.Run(want, func(t *testing.T) { + t.Parallel() + got := EncodeSegment([]byte(raw)) + if got != want { + t.Fatalf("EncodeSegment(%q) = %q, want %q", raw, got, want) + } + dec, err := DecodeSegment(got) + if err != nil { + t.Fatalf("DecodeSegment(%q) error: %v", got, err) + } + if string(dec) != raw { + t.Fatalf("round-trip: got %q, want %q", dec, raw) + } + }) + } +} + +func TestEncodeSegment_HexIsUppercase(t *testing.T) { + t.Parallel() + enc := EncodeSegment([]byte{0xab, 0xcd}) + if enc != "%AB%CD" { + t.Fatalf("EncodeSegment hex case: got %q want %q", enc, "%AB%CD") + } +} + +func TestEncodeSegment_LongInputTakesShaFallback(t *testing.T) { + t.Parallel() + // 250 bytes of unreserved chars: percent-encoded length == 250, which + // exceeds the 240-byte ceiling, so the SHA fallback fires. + raw := strings.Repeat("a", 250) + enc := EncodeSegment([]byte(raw)) + if !IsShaFallback(enc) { + t.Fatalf("EncodeSegment(250 unreserved bytes) did not take SHA fallback: %q", enc) + } + if len(enc) > maxSegmentBytes { + t.Fatalf("SHA-fallback output exceeds max: len=%d > %d", len(enc), maxSegmentBytes) + } + // Decoder reports the fallback and refuses to fabricate the original. 
+	if _, err := DecodeSegment(enc); !errors.Is(err, ErrShaFallbackNeedsKeymap) {
+		t.Fatalf("DecodeSegment of SHA-fallback: err=%v want ErrShaFallbackNeedsKeymap", err)
+	}
+}
+
+func TestEncodeSegment_ShortBytesThatExpandPastCeilingTakeShaFallback(t *testing.T) {
+	t.Parallel()
+	// Each byte percent-encodes to 3 chars, so 81 reserved bytes -> 243 chars.
+	raw := strings.Repeat("\x01", 81)
+	enc := EncodeSegment([]byte(raw))
+	if !IsShaFallback(enc) {
+		t.Fatalf("expected SHA fallback for 81 reserved bytes (243 expanded), got %q", enc)
+	}
+}
+
+func TestEncodeSegment_Deterministic(t *testing.T) {
+	t.Parallel()
+	// Same input must encode to the same output across calls.
+	raw := []byte("session:abc:123/4")
+	a := EncodeSegment(raw)
+	b := EncodeSegment(raw)
+	if a != b {
+		t.Fatalf("non-deterministic: %q != %q", a, b)
+	}
+	// Note: EncodeSegment is intentionally NOT idempotent — `%` is a reserved
+	// byte and a second pass percent-encodes it again. Decode-then-encode is
+	// the round-trip that holds, and is covered by other tests.
+}
+
+func TestEncodeBinarySegment_BasicRoundTrip(t *testing.T) {
+	t.Parallel()
+	cases := [][]byte{
+		nil,
+		{},
+		{0x00},
+		{0x01, 0x02, 0x03},
+		[]byte("not-binary-but-still-a-byte-string"),
+		{0xff, 0xfe, 0xfd, 0xfc},
+	}
+	for _, raw := range cases {
+		enc := EncodeBinarySegment(raw)
+		if !strings.HasPrefix(enc, binaryPrefix) {
+			t.Fatalf("EncodeBinarySegment(%x) = %q, missing binary prefix", raw, enc)
+		}
+		if !IsBinarySegment(enc) {
+			t.Fatalf("IsBinarySegment(%q) = false", enc)
+		}
+		dec, err := DecodeSegment(enc)
+		if err != nil {
+			t.Fatalf("DecodeSegment(%q) error: %v", enc, err)
+		}
+		if string(dec) != string(raw) {
+			t.Fatalf("binary round-trip: got %x want %x", dec, raw)
+		}
+	}
+}
+
+func TestEncodeBinarySegment_LongInputTakesShaFallback(t *testing.T) {
+	t.Parallel()
+	// base64 ~= 4/3 the raw length; raw=200 -> ~268 chars after b64 prefix.
+	raw := make([]byte, 200)
+	if _, err := rand.Read(raw); err != nil {
+		t.Fatalf("rand: %v", err)
+	}
+	enc := EncodeBinarySegment(raw)
+	if !IsShaFallback(enc) {
+		t.Fatalf("expected SHA fallback for 200-byte binary, got %q (len %d)", enc, len(enc))
+	}
+}
+
+func TestEncodeSegment_ShaFallbackPrefixCannotCollideWithEncodedHex(t *testing.T) {
+	t.Parallel()
+	// A user key consisting solely of unreserved hex chars and underscores
+	// must NOT be detected as a SHA fallback: the SHA fallback requires the
+	// 32-hex prefix to be followed by exactly "__" — user input with a
+	// shorter prefix or the wrong separator must fall through.
+	cases := []string{
+		// 31 hex + double-underscore: too short by one
+		"0123456789abcdef0123456789abcde__",
+		// 32 hex + single underscore: fails the minimum-length check
+		"0123456789abcdef0123456789abcdef_",
+		// 32 hex, but the separator check at offset 32 sees "_x", not "__".
+ "0123456789abcdef0123456789abcdef_x", + } + for _, c := range cases { + if IsShaFallback(c) { + t.Fatalf("false positive: IsShaFallback(%q) = true", c) + } + } +} + +func TestDecodeSegment_RejectsTruncatedPercentEscape(t *testing.T) { + t.Parallel() + cases := []string{ + "%", + "%1", + "abc%", + "abc%2", + } + for _, c := range cases { + if _, err := DecodeSegment(c); !errors.Is(err, ErrInvalidEncodedSegment) { + t.Fatalf("DecodeSegment(%q): err=%v want ErrInvalidEncodedSegment", c, err) + } + } +} + +func TestDecodeSegment_RejectsNonHexInPercentEscape(t *testing.T) { + t.Parallel() + cases := []string{ + "%GG", + "%1G", + "%G1", + "foo%XYbar", + } + for _, c := range cases { + if _, err := DecodeSegment(c); !errors.Is(err, ErrInvalidEncodedSegment) { + t.Fatalf("DecodeSegment(%q): err=%v want ErrInvalidEncodedSegment", c, err) + } + } +} + +func TestDecodeSegment_RejectsRawReservedBytes(t *testing.T) { + t.Parallel() + // A literal `/` or other reserved byte in an encoded segment is invalid; + // percent-encoded segments must contain only [unreserved-set | "%HH"]. + cases := []string{ + "a/b", + "a:b", + "hello world", + } + for _, c := range cases { + if _, err := DecodeSegment(c); !errors.Is(err, ErrInvalidEncodedSegment) { + t.Fatalf("DecodeSegment(%q): err=%v want ErrInvalidEncodedSegment", c, err) + } + } +} + +func TestDecodeSegment_RejectsMalformedBinary(t *testing.T) { + t.Parallel() + cases := []string{ + "b64.!!!", // "!" is not a base64url alphabet character + "b64.padding=", // RawURLEncoding does not accept padding + } + for _, c := range cases { + if _, err := DecodeSegment(c); !errors.Is(err, ErrInvalidEncodedSegment) { + t.Fatalf("DecodeSegment(%q): err=%v want ErrInvalidEncodedSegment", c, err) + } + } +} + +func TestEncodeSegment_OutputLengthBoundedByMax(t *testing.T) { + t.Parallel() + // For any input — including pathological ones that expand 3x under + // percent-encoding — the encoded output never exceeds maxSegmentBytes. + for _, n := range []int{0, 1, 240, 241, 1000, 65536} { + raw := make([]byte, n) + if _, err := rand.Read(raw); err != nil { + t.Fatalf("rand: %v", err) + } + enc := EncodeSegment(raw) + if len(enc) > maxSegmentBytes { + t.Fatalf("EncodeSegment(len=%d): output len=%d > max=%d", n, len(enc), maxSegmentBytes) + } + } +} + +func TestEncodeSegment_FuzzRoundTripIfNotShaFallback(t *testing.T) { + t.Parallel() + rapid.Check(t, func(t *rapid.T) { + raw := rapid.SliceOfN(rapid.Byte(), 0, 80).Draw(t, "raw") + enc := EncodeSegment(raw) + if IsShaFallback(enc) { + // Only assert the documented post-condition for fallback inputs: + // decode must refuse rather than fabricate. 
+			if _, err := DecodeSegment(enc); !errors.Is(err, ErrShaFallbackNeedsKeymap) {
+				t.Fatalf("SHA-fallback decode did not return ErrShaFallbackNeedsKeymap: %v", err)
+			}
+			return
+		}
+		dec, err := DecodeSegment(enc)
+		if err != nil {
+			t.Fatalf("DecodeSegment(%q) error: %v (raw=%x)", enc, err, raw)
+		}
+		if string(dec) != string(raw) {
+			t.Fatalf("round-trip mismatch: raw=%x dec=%x enc=%q", raw, dec, enc)
		}
+	})
+}
+
+func TestEncodeBinarySegment_FuzzRoundTripIfNotShaFallback(t *testing.T) {
+	t.Parallel()
+	rapid.Check(t, func(t *rapid.T) {
+		raw := rapid.SliceOfN(rapid.Byte(), 0, 150).Draw(t, "raw")
+		enc := EncodeBinarySegment(raw)
+		if IsShaFallback(enc) {
+			if _, err := DecodeSegment(enc); !errors.Is(err, ErrShaFallbackNeedsKeymap) {
+				t.Fatalf("SHA-fallback decode did not return ErrShaFallbackNeedsKeymap: %v", err)
+			}
+			return
+		}
+		if !IsBinarySegment(enc) {
+			t.Fatalf("non-fallback binary segment missing prefix: %q", enc)
+		}
+		dec, err := DecodeSegment(enc)
+		if err != nil {
+			t.Fatalf("DecodeSegment(%q) error: %v (raw=%x)", enc, err, raw)
+		}
+		if string(dec) != string(raw) {
+			t.Fatalf("binary round-trip mismatch: raw=%x dec=%x enc=%q", raw, dec, enc)
+		}
+	})
+}
+
+func TestEncodeSegment_ShaFallbackEmbedsRecognisableSuffix(t *testing.T) {
+	t.Parallel()
+	// The truncated suffix in the SHA-fallback rendering must be derivable
+	// from the original key, so an operator can grep the file tree for a
+	// known-prefix key. Use an all-letter prefix so percent-encoding leaves
+	// it intact (otherwise the suffix is itself percent-encoded).
+	prefix := "human-recognisable-prefix"
+	raw := []byte(prefix + strings.Repeat("a", 300))
+	enc := EncodeSegment(raw)
+	if !IsShaFallback(enc) {
+		t.Fatalf("expected SHA fallback for 325-byte input")
+	}
+	if !strings.Contains(enc, prefix) {
+		t.Fatalf("SHA fallback %q does not contain %q", enc, prefix)
+	}
+}
diff --git a/internal/backup/keymap.go b/internal/backup/keymap.go
new file mode 100644
index 00000000..f55214f7
--- /dev/null
+++ b/internal/backup/keymap.go
@@ -0,0 +1,199 @@
+package backup
+
+import (
+	"bufio"
+	"encoding/base64"
+	"encoding/json"
+	"io"
+
+	"github.com/cockroachdb/errors"
+)
+
+// KEYMAP.jsonl shape (one record per line):
+//
+//	{"encoded":"<filename-segment>","original":"<base64url-no-padding>","kind":"sha-fallback"}
+//
+// Records are written in encounter order (the order the encoder produced
+// them) and never modified after write. The file is append-only; if the same
+// encoded segment is written twice the reader keeps the last entry, but the
+// encoder is expected not to emit duplicates within a single dump.
+//
+// Records exist only for entries whose original bytes are NOT recoverable
+// from the encoded filename alone:
+//
+//   - KindSHAFallback — segment is `<sha256-hex-prefix>__<truncated-original>`
+//     (filename length exceeded EncodeSegment's 240-byte ceiling).
+//   - KindS3LeafData — S3 object renamed to `<key>.elastickv-leaf-data`
+//     because both `<key>` and `<key>/...` existed in the same bucket.
+//   - KindMetaCollision — user S3 object key happened to end in
+//     `.elastickv-meta.json`; renamed under --rename-collisions.
+//
+// A consumer that does not care about reversing these to original bytes can
+// ignore KEYMAP.jsonl entirely.
+const (
+	KindSHAFallback   = "sha-fallback"
+	KindS3LeafData    = "s3-leaf-data"
+	KindMetaCollision = "meta-suffix-rename"
+)
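Putting the two halves together: a producer records a keymap entry whenever EncodeSegment falls back, and a restore tool resolves fallback names through the loaded map. A sketch under those assumptions — both helper functions are hypothetical, using only the APIs this file and filename.go define:

```go
// Producer side: record the mapping whenever the encoded form is irreversible.
func encodeWithKeymap(raw []byte, km *KeymapWriter) (string, error) {
	seg := EncodeSegment(raw)
	if IsShaFallback(seg) {
		if err := km.WriteOriginal(seg, raw, KindSHAFallback); err != nil {
			return "", err
		}
	}
	return seg, nil
}

// Restore side: decode a segment, consulting the keymap only on fallback.
func resolveSegment(seg string, km map[string]KeymapRecord) ([]byte, error) {
	raw, err := DecodeSegment(seg)
	if err == nil {
		return raw, nil
	}
	if !errors.Is(err, ErrShaFallbackNeedsKeymap) {
		return nil, err // not a segment this package produced
	}
	rec, ok := km[seg]
	if !ok {
		return nil, errors.Wrapf(ErrInvalidKeymapRecord, "no KEYMAP entry for %q", seg)
	}
	return rec.Original()
}
```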
+// keymapBufSizeWriter is the bufio.Writer buffer size for the JSONL writer.
+// 64 KiB amortises the per-syscall cost across hundreds of small records
+// without holding pathological amounts of memory.
+const keymapBufSizeWriter = 64 << 10
+
+// keymapBufSizeReader bounds bufio.Scanner's per-line buffer. KEYMAP records
+// carry a ~240-byte encoded segment plus a base64'd original key (which can
+// itself be arbitrarily large but is bounded by the practical maximum key
+// size on the source store). 1 MiB per line is generous; if a record
+// genuinely exceeds it the reader surfaces bufio.ErrTooLong rather than
+// silently truncating.
+const keymapBufSizeReader = 1 << 20
+
+// ErrInvalidKeymapRecord is returned by Reader.Next when a line does not
+// parse as a KeymapRecord (malformed JSON, missing field, malformed
+// base64, etc.).
+var ErrInvalidKeymapRecord = errors.New("backup: invalid KEYMAP.jsonl record")
+
+// KeymapRecord is a single mapping from encoded filename component back to
+// the original key bytes. Original bytes are arbitrary (binary safe), so
+// they are encoded as base64url-no-padding for transport in JSON.
+type KeymapRecord struct {
+	// Encoded is the filename segment as it appears in the dump tree.
+	Encoded string `json:"encoded"`
+	// OriginalB64 is base64url-no-padding of the original key bytes.
+	OriginalB64 string `json:"original"`
+	// Kind classifies why this record exists; see Kind* constants.
+	Kind string `json:"kind"`
+}
+
+// Original returns the decoded original key bytes from r.OriginalB64.
+func (r KeymapRecord) Original() ([]byte, error) {
+	out, err := base64.RawURLEncoding.DecodeString(r.OriginalB64)
+	if err != nil {
+		return nil, errors.Wrap(ErrInvalidKeymapRecord, err.Error())
+	}
+	return out, nil
+}
+
+// KeymapWriter appends records to a KEYMAP.jsonl stream. It is NOT safe for
+// concurrent use — nothing in the bufio.Writer path locks — so the caller is
+// expected to serialise Write calls and use a single writer per scope.
+type KeymapWriter struct {
+	bw  *bufio.Writer
+	enc *json.Encoder
+	// count tracks how many records have been written; exposed so the caller
+	// can decide to omit an empty KEYMAP.jsonl file (per the spec, the file
+	// is omitted when no entries exist).
+	count int
+}
+
+// NewKeymapWriter returns a writer that appends JSONL records to w. Close
+// must be called to flush.
+func NewKeymapWriter(w io.Writer) *KeymapWriter {
+	bw := bufio.NewWriterSize(w, keymapBufSizeWriter)
+	enc := json.NewEncoder(bw)
+	enc.SetEscapeHTML(false) // we never embed user keys in HTML; preserve `<>&`
+	return &KeymapWriter{bw: bw, enc: enc}
+}
+
+// Write appends one KeymapRecord. The record is JSON-serialised with a
+// trailing newline (json.Encoder behavior), giving the JSONL contract.
+func (w *KeymapWriter) Write(rec KeymapRecord) error {
+	if rec.Encoded == "" {
+		return errors.WithStack(errors.New("backup: KEYMAP record encoded must be non-empty"))
+	}
+	if rec.Kind == "" {
+		return errors.WithStack(errors.New("backup: KEYMAP record kind must be non-empty"))
+	}
+	if err := w.enc.Encode(rec); err != nil {
+		return errors.WithStack(err)
+	}
+	w.count++
+	return nil
+}
+
+// WriteOriginal is a convenience wrapper that base64-encodes raw original
+// bytes for the caller.
+func (w *KeymapWriter) WriteOriginal(encoded string, original []byte, kind string) error {
+	return w.Write(KeymapRecord{
+		Encoded:     encoded,
+		OriginalB64: base64.RawURLEncoding.EncodeToString(original),
+		Kind:        kind,
+	})
+}
+
+// Count returns the number of records written so far. Useful for the
+// "omit empty KEYMAP file" decision after the dump completes.
+func (w *KeymapWriter) Count() int { return w.count }
+
+// Close flushes any buffered records to the underlying writer.
+func (w *KeymapWriter) Close() error { + if w.bw == nil { + return nil + } + if err := w.bw.Flush(); err != nil { + return errors.WithStack(err) + } + return nil +} + +// KeymapReader iterates JSONL records line-by-line. Memory footprint is +// bounded by keymapBufSizeReader regardless of file size. +type KeymapReader struct { + sc *bufio.Scanner + err error +} + +// NewKeymapReader wraps r so the caller can iterate records via Next. +func NewKeymapReader(r io.Reader) *KeymapReader { + sc := bufio.NewScanner(r) + sc.Buffer(make([]byte, 0, keymapBufSizeReader), keymapBufSizeReader) + return &KeymapReader{sc: sc} +} + +// Next decodes the next record. It returns (rec, true, nil) on success, +// (zero, false, nil) at end of stream, and (zero, false, err) on parse +// failure or I/O error. Once an error is returned the reader is sticky: +// subsequent calls return the same error. +func (r *KeymapReader) Next() (KeymapRecord, bool, error) { + if r.err != nil { + return KeymapRecord{}, false, r.err + } + if !r.sc.Scan() { + if err := r.sc.Err(); err != nil { + r.err = errors.WithStack(err) + return KeymapRecord{}, false, r.err + } + return KeymapRecord{}, false, nil + } + line := r.sc.Bytes() + var rec KeymapRecord + if err := json.Unmarshal(line, &rec); err != nil { + r.err = errors.Wrap(ErrInvalidKeymapRecord, err.Error()) + return KeymapRecord{}, false, r.err + } + if rec.Encoded == "" || rec.Kind == "" { + r.err = errors.Wrap(ErrInvalidKeymapRecord, "missing encoded or kind") + return KeymapRecord{}, false, r.err + } + return rec, true, nil +} + +// LoadKeymap reads every record from r into an in-memory map keyed by +// encoded segment. The last record wins on duplicates. Suitable for +// scopes where the keymap fits comfortably in memory; for large scopes +// callers should use KeymapReader directly. 
+func LoadKeymap(r io.Reader) (map[string]KeymapRecord, error) { + out := make(map[string]KeymapRecord) + rd := NewKeymapReader(r) + for { + rec, ok, err := rd.Next() + if err != nil { + return nil, err + } + if !ok { + return out, nil + } + out[rec.Encoded] = rec + } +} diff --git a/internal/backup/keymap_test.go b/internal/backup/keymap_test.go new file mode 100644 index 00000000..9e2ddcea --- /dev/null +++ b/internal/backup/keymap_test.go @@ -0,0 +1,205 @@ +package backup + +import ( + "bytes" + "strings" + "testing" + + "github.com/cockroachdb/errors" +) + +type keymapCase struct { + encoded string + original []byte + kind string +} + +func keymapRoundTripCases() []keymapCase { + return []keymapCase{ + {"abcdef0123456789abcdef0123456789__hello", []byte("hello-but-much-longer-than-fits"), KindSHAFallback}, + {"path%2Fto.elastickv-leaf-data", []byte("path/to"), KindS3LeafData}, + {"foo.elastickv-meta.json.user-data", []byte("foo.elastickv-meta.json"), KindMetaCollision}, + {"binary-key", []byte{0x00, 0xff, 0x01, 0xfe}, KindSHAFallback}, + {"empty-original", []byte{}, KindSHAFallback}, + } +} + +func writeKeymapCases(t *testing.T, w *KeymapWriter, cases []keymapCase) { + t.Helper() + for _, c := range cases { + if err := w.WriteOriginal(c.encoded, c.original, c.kind); err != nil { + t.Fatalf("Write(%q): %v", c.encoded, err) + } + } +} + +func assertKeymapRecord(t *testing.T, got map[string]KeymapRecord, c keymapCase) { + t.Helper() + rec, ok := got[c.encoded] + if !ok { + t.Fatalf("missing record for %q", c.encoded) + } + if rec.Kind != c.kind { + t.Fatalf("%q kind = %q, want %q", c.encoded, rec.Kind, c.kind) + } + orig, err := rec.Original() + if err != nil { + t.Fatalf("%q Original: %v", c.encoded, err) + } + if !bytes.Equal(orig, c.original) { + t.Fatalf("%q original = %x, want %x", c.encoded, orig, c.original) + } +} + +func TestKeymapWriter_RoundTrip(t *testing.T) { + t.Parallel() + cases := keymapRoundTripCases() + var buf bytes.Buffer + w := NewKeymapWriter(&buf) + writeKeymapCases(t, w, cases) + if w.Count() != len(cases) { + t.Fatalf("Count = %d, want %d", w.Count(), len(cases)) + } + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + + got, err := LoadKeymap(&buf) + if err != nil { + t.Fatalf("LoadKeymap: %v", err) + } + if len(got) != len(cases) { + t.Fatalf("loaded len = %d, want %d", len(got), len(cases)) + } + for _, c := range cases { + assertKeymapRecord(t, got, c) + } +} + +func TestKeymapWriter_RejectsEmptyEncoded(t *testing.T) { + t.Parallel() + w := NewKeymapWriter(&bytes.Buffer{}) + if err := w.Write(KeymapRecord{Encoded: "", Kind: KindSHAFallback}); err == nil { + t.Fatalf("expected error for empty encoded, got nil") + } + if err := w.Write(KeymapRecord{Encoded: "x", Kind: ""}); err == nil { + t.Fatalf("expected error for empty kind, got nil") + } +} + +func TestKeymapWriter_DoesNotEscapeHTML(t *testing.T) { + t.Parallel() + var buf bytes.Buffer + w := NewKeymapWriter(&buf) + // json.Encoder escapes `<`, `>`, `&` by default; we disable that so + // keys containing these bytes encode/decode without surprise. 
+	if err := w.WriteOriginal("a%3Cb%3Ec", []byte("a<b>c&d"), KindSHAFallback); err != nil {
+		t.Fatalf("WriteOriginal: %v", err)
+	}
+	if err := w.Close(); err != nil {
+		t.Fatalf("Close: %v", err)
+	}
+	out := buf.String()
+	if strings.Contains(out, `\u003c`) || strings.Contains(out, `\u003e`) || strings.Contains(out, `\u0026`) {
+		t.Fatalf("unwanted HTML escape in output: %q", out)
+	}
+	// And the base64 of "a<b>c&d" appears intact:
+	if !strings.Contains(out, "YTxiPmMmZA") {
+		t.Fatalf("missing base64 of original in output: %q", out)
+	}
+}
+
+func TestKeymapWriter_OmitEmpty(t *testing.T) {
+	t.Parallel()
+	// The "omit when empty" decision is the caller's; the writer just
+	// reports whether any records were written.
+	var buf bytes.Buffer
+	w := NewKeymapWriter(&buf)
+	if err := w.Close(); err != nil {
+		t.Fatalf("Close: %v", err)
+	}
+	if w.Count() != 0 {
+		t.Fatalf("Count = %d, want 0", w.Count())
+	}
+	if buf.Len() != 0 {
+		t.Fatalf("empty writer produced output: %q", buf.String())
+	}
+}
+
+func TestKeymapReader_RejectsMalformedJSON(t *testing.T) {
+	t.Parallel()
+	r := NewKeymapReader(strings.NewReader("not-json\n"))
+	_, _, err := r.Next()
+	if !errors.Is(err, ErrInvalidKeymapRecord) {
+		t.Fatalf("err = %v, want ErrInvalidKeymapRecord", err)
+	}
+	// Sticky: subsequent calls return the same wrapped error class.
+	_, _, err2 := r.Next()
+	if !errors.Is(err2, ErrInvalidKeymapRecord) {
+		t.Fatalf("non-sticky error: %v", err2)
+	}
+}
+
+func TestKeymapReader_RejectsRecordWithoutEncodedOrKind(t *testing.T) {
+	t.Parallel()
+	cases := []string{
+		`{"original":"AA"}`,
+		`{"encoded":"","kind":"sha-fallback"}`,
+		`{"encoded":"x"}`,
+		`{"encoded":"x","kind":""}`,
+	}
+	for _, line := range cases {
+		r := NewKeymapReader(strings.NewReader(line + "\n"))
+		_, _, err := r.Next()
+		if !errors.Is(err, ErrInvalidKeymapRecord) {
+			t.Fatalf("input %q: err = %v, want ErrInvalidKeymapRecord", line, err)
+		}
+	}
+}
+
+func TestKeymapReader_RejectsMidStreamBlankLine(t *testing.T) {
+	t.Parallel()
+	// bufio.Scanner skips the trailing newline but emits an empty line when
+	// one sits in the middle of the stream. We require strict JSONL — every
+	// line must be a valid record. An empty line in the middle should
+	// surface as ErrInvalidKeymapRecord rather than being silently skipped,
+	// so truncated dumps are recognised.
+ input := `{"encoded":"x","original":"AA","kind":"sha-fallback"}` + "\n\n" + + `{"encoded":"y","original":"AA","kind":"sha-fallback"}` + "\n" + r := NewKeymapReader(strings.NewReader(input)) + if _, ok, err := r.Next(); !ok || err != nil { + t.Fatalf("first record: ok=%v err=%v", ok, err) + } + if _, _, err := r.Next(); !errors.Is(err, ErrInvalidKeymapRecord) { + t.Fatalf("blank line: err=%v want ErrInvalidKeymapRecord", err) + } +} + +func TestLoadKeymap_LastRecordWins(t *testing.T) { + t.Parallel() + input := `{"encoded":"x","original":"YQ","kind":"sha-fallback"}` + "\n" + + `{"encoded":"x","original":"Yg","kind":"sha-fallback"}` + "\n" + got, err := LoadKeymap(strings.NewReader(input)) + if err != nil { + t.Fatalf("LoadKeymap: %v", err) + } + rec, ok := got["x"] + if !ok { + t.Fatalf("missing record") + } + orig, err := rec.Original() + if err != nil { + t.Fatalf("Original: %v", err) + } + if string(orig) != "b" { + t.Fatalf("last-wins broken: got %q want %q", orig, "b") + } +} + +func TestKeymapRecord_OriginalRejectsBadBase64(t *testing.T) { + t.Parallel() + rec := KeymapRecord{Encoded: "x", OriginalB64: "!!!", Kind: KindSHAFallback} + if _, err := rec.Original(); !errors.Is(err, ErrInvalidKeymapRecord) { + t.Fatalf("err = %v, want ErrInvalidKeymapRecord", err) + } +} diff --git a/internal/backup/manifest.go b/internal/backup/manifest.go new file mode 100644 index 00000000..47d6df02 --- /dev/null +++ b/internal/backup/manifest.go @@ -0,0 +1,275 @@ +package backup + +import ( + "encoding/json" + "io" + "time" + + "github.com/cockroachdb/errors" +) + +// MANIFEST.json is the only file a restore tool must read first. All other +// files in a dump are decoded from their on-disk path and contents. The +// manifest records: +// +// - format_version (the only field a restore tool MUST consult before +// trusting anything else) +// - phase ("phase0-snapshot-decode" or "phase1-live-pinned") so a +// consumer that cares about cross-shard PIT consistency can warn or +// refuse on Phase 0 inputs +// - source/origin metadata so a restore is auditable +// - exclusion flags + format-policy fields so the producer's rendering +// choices are explicit at restore time + +// CurrentFormatVersion is the format major-version this code emits and +// accepts. Restore-side code MUST refuse `format_version > current`. A +// minor-version bump (e.g., adding optional fields) does not change this +// constant. +const CurrentFormatVersion uint32 = 1 + +const ( + // PhasePhase0SnapshotDecode marks dumps produced by Phase 0a (offline + // snapshot decoder). + PhasePhase0SnapshotDecode = "phase0-snapshot-decode" + // PhasePhase1LivePinned marks dumps produced by Phase 1 (live PIT + // extraction with cluster-wide read_ts pinning). + PhasePhase1LivePinned = "phase1-live-pinned" +) + +const ( + // ChecksumAlgorithmSHA256 is the only checksum algorithm Phase 0a writes. + // Phase 1 may add others later (e.g. blake3) under the same field. + ChecksumAlgorithmSHA256 = "sha256" + // ChecksumFormatSha256sum identifies the line-oriented sha256sum(1) + // format used by the CHECKSUMS file. Operators verify with + // `sha256sum -c CHECKSUMS` from the dump root. + ChecksumFormatSha256sum = "sha256sum" + // EncodedFilenameCharsetRFC3986 is the EncodeSegment charset used for + // every non-S3-object filename in the dump. + EncodedFilenameCharsetRFC3986 = "rfc3986-unreserved-plus-percent" + // S3MetaSuffixDefault is the reserved suffix for the S3 sidecar + // metadata file (`.elastickv-meta.json`). 
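+	// Illustrative naming under this suffix (the object key below is an
+	// assumed example, not taken from the spec):
+	//
+	//	object key: photos/cat.jpg
+	//	sidecar:    photos/cat.jpg.elastickv-meta.json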
+	S3MetaSuffixDefault = ".elastickv-meta.json"
+	// S3CollisionStrategyLeafDataSuffix appends `.elastickv-leaf-data`
+	// to the shorter of two colliding S3 keys and records the rename
+	// in KEYMAP.jsonl.
+	S3CollisionStrategyLeafDataSuffix = "leaf-data-suffix"
+	// DynamoDBLayoutPerItem emits one item per file
+	// (`items/<item>.json`); the default layout.
+	DynamoDBLayoutPerItem = "per-item"
+	// DynamoDBLayoutJSONL bundles items into `items/data-<n>.jsonl`
+	// (opt-in via --dynamodb-bundle-mode jsonl).
+	DynamoDBLayoutJSONL = "jsonl"
+	// KeySegmentMaxBytesDefault matches EncodeSegment's maxSegmentBytes.
+	KeySegmentMaxBytesDefault uint32 = 240
+)
+
+// Source records where a Phase 0 dump came from. Phase 1 dumps leave Source
+// nil and populate Live instead.
+type Source struct {
+	// FSMPath is the absolute or relative path of the .fsm file the
+	// decoder consumed.
+	FSMPath string `json:"fsm_path"`
+	// FSMCRC32C is the CRC32C value the decoder verified against the
+	// .fsm file's footer (lowercase hex).
+	FSMCRC32C string `json:"fsm_crc32c,omitempty"`
+}
+
+// Live records the cluster-wide pinning information that produced a Phase 1
+// dump. Phase 0 dumps leave this nil.
+type Live struct {
+	// ReadTS is the pinned read_ts at which BackupScanner traversed the
+	// keyspace.
+	ReadTS uint64 `json:"read_ts"`
+	// PinTokenSHA256 is the hex SHA-256 of the pin_token issued by
+	// BeginBackup. Stored as a hash rather than the raw token so the
+	// manifest carries no auth-sensitive material.
+	PinTokenSHA256 string `json:"pin_token_sha256,omitempty"`
+}
+
+// Adapters lists which scopes were dumped per adapter. An empty slice
+// means "no scopes for this adapter were dumped"; a nil slice means
+// "this adapter was not in the dump's scope filter."
+type Adapters struct {
+	DynamoDB Adapter `json:"dynamodb"`
+	S3       Adapter `json:"s3"`
+	Redis    Adapter `json:"redis"`
+	SQS      Adapter `json:"sqs"`
+}
+
+// Adapter holds the scope identifiers for one adapter. Field names are
+// per-adapter to match the protocol's natural vocabulary.
+type Adapter struct {
+	Tables    []string `json:"tables,omitempty"`
+	Buckets   []string `json:"buckets,omitempty"`
+	Databases []uint32 `json:"databases,omitempty"`
+	Queues    []string `json:"queues,omitempty"`
+}
+
+// Exclusions records the producer-side flags that affected which records
+// were emitted. Restore tools log these so an operator can correlate a
+// surprising dump shape with the producer invocation.
+type Exclusions struct {
+	IncludeIncompleteUploads bool `json:"include_incomplete_uploads"`
+	IncludeOrphans           bool `json:"include_orphans"`
+	PreserveSQSVisibility    bool `json:"preserve_sqs_visibility"`
+	IncludeSQSSideRecords    bool `json:"include_sqs_side_records"`
+}
+
+// Manifest is the on-disk MANIFEST.json structure. Field tags match the
+// spec in docs/design/2026_04_29_proposed_snapshot_logical_decoder.md.
+type Manifest struct {
+	FormatVersion    uint32     `json:"format_version"`
+	Phase            string     `json:"phase"`
+	ElastickvVersion string     `json:"elastickv_version,omitempty"`
+	ClusterID        string     `json:"cluster_id,omitempty"`
+	SnapshotIndex    uint64     `json:"snapshot_index,omitempty"`
+	LastCommitTS     uint64     `json:"last_commit_ts,omitempty"`
+	WallTimeISO      string     `json:"wall_time_iso"`
+	Source           *Source    `json:"source,omitempty"`
+	Live             *Live      `json:"live,omitempty"`
+	Adapters         Adapters   `json:"adapters"`
+	Exclusions       Exclusions `json:"exclusions"`
+	ChecksumAlgorithm string `json:"checksum_algorithm"`
+	ChecksumFormat    string `json:"checksum_format"`
+
+	EncodedFilenameCharset string `json:"encoded_filename_charset"`
+	KeySegmentMaxBytes     uint32 `json:"key_segment_max_bytes"`
+	S3MetaSuffix           string `json:"s3_meta_suffix"`
+	S3CollisionStrategy    string `json:"s3_collision_strategy"`
+	DynamoDBLayout         string `json:"dynamodb_layout"`
+}
+
+// ErrUnsupportedFormatVersion is returned by ReadManifest when the on-disk
+// format_version is greater than CurrentFormatVersion or zero.
+var ErrUnsupportedFormatVersion = errors.New("backup: manifest format_version unsupported")
+
+// ErrInvalidManifest is returned by ReadManifest when the JSON parses but
+// fails structural validation (missing required field, unknown phase, etc.).
+var ErrInvalidManifest = errors.New("backup: manifest invalid")
+
+// NewPhase0SnapshotManifest seeds a manifest with the Phase 0a defaults.
+// Callers fill in scope (Adapters), Source/wall time and exclusions before
+// passing it to WriteManifest.
+func NewPhase0SnapshotManifest(now time.Time) Manifest {
+	return Manifest{
+		FormatVersion:          CurrentFormatVersion,
+		Phase:                  PhasePhase0SnapshotDecode,
+		WallTimeISO:            now.UTC().Format(time.RFC3339Nano),
+		ChecksumAlgorithm:      ChecksumAlgorithmSHA256,
+		ChecksumFormat:         ChecksumFormatSha256sum,
+		EncodedFilenameCharset: EncodedFilenameCharsetRFC3986,
+		KeySegmentMaxBytes:     KeySegmentMaxBytesDefault,
+		S3MetaSuffix:           S3MetaSuffixDefault,
+		S3CollisionStrategy:    S3CollisionStrategyLeafDataSuffix,
+		DynamoDBLayout:         DynamoDBLayoutPerItem,
+	}
+}
+
+// WriteManifest serialises m as pretty-printed JSON to w.
+//
+// Pretty-printing is deliberate — MANIFEST.json is operator-facing and is
+// expected to be `cat`-ed and `jq`-ed during incident response.
+func WriteManifest(w io.Writer, m Manifest) error {
+	if err := m.validate(); err != nil {
+		return err
+	}
+	enc := json.NewEncoder(w)
+	enc.SetIndent("", "  ") //nolint:mnd // 2-space indent matches the `jq .` default
+	enc.SetEscapeHTML(false)
+	if err := enc.Encode(m); err != nil {
+		return errors.WithStack(err)
+	}
+	return nil
+}
+
+// ReadManifest decodes and validates a MANIFEST.json from r. The returned
+// error is wrapped as ErrUnsupportedFormatVersion or ErrInvalidManifest so
+// callers can branch on errors.Is.
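+//
+// A minimal restore-side gate (an illustrative sketch; f is an assumed
+// open io.Reader over MANIFEST.json):
+//
+//	m, err := ReadManifest(f)
+//	switch {
+//	case errors.Is(err, ErrUnsupportedFormatVersion):
+//		// dump written by a newer producer: upgrade the restore tool
+//	case errors.Is(err, ErrInvalidManifest):
+//		// structurally broken manifest: abort the restore
+//	case err == nil && m.Phase == PhasePhase0SnapshotDecode:
+//		// no cross-shard PIT guarantee: warn before proceeding
+//	}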
+func ReadManifest(r io.Reader) (Manifest, error) { + var m Manifest + dec := json.NewDecoder(r) + dec.DisallowUnknownFields() // surface format drift loudly + if err := dec.Decode(&m); err != nil { + return Manifest{}, errors.Wrap(ErrInvalidManifest, err.Error()) + } + if m.FormatVersion == 0 { + return Manifest{}, errors.Wrapf(ErrUnsupportedFormatVersion, + "format_version is zero") + } + if m.FormatVersion > CurrentFormatVersion { + return Manifest{}, errors.Wrapf(ErrUnsupportedFormatVersion, + "format_version %d > current %d (newer producer)", m.FormatVersion, CurrentFormatVersion) + } + if err := m.validate(); err != nil { + return Manifest{}, err + } + return m, nil +} + +func (m Manifest) validate() error { + if err := m.validateRequiredFields(); err != nil { + return err + } + if err := m.validatePolicyFields(); err != nil { + return err + } + return m.validatePhaseSpecific() +} + +func (m Manifest) validateRequiredFields() error { + if m.FormatVersion == 0 { + return errors.Wrap(ErrInvalidManifest, "format_version is zero") + } + switch m.Phase { + case PhasePhase0SnapshotDecode, PhasePhase1LivePinned: + default: + return errors.Wrapf(ErrInvalidManifest, "unknown phase %q", m.Phase) + } + if m.WallTimeISO == "" { + return errors.Wrap(ErrInvalidManifest, "wall_time_iso missing") + } + if _, err := time.Parse(time.RFC3339Nano, m.WallTimeISO); err != nil { + return errors.Wrapf(ErrInvalidManifest, "wall_time_iso unparseable: %v", err) + } + return nil +} + +func (m Manifest) validatePolicyFields() error { + if m.ChecksumAlgorithm == "" { + return errors.Wrap(ErrInvalidManifest, "checksum_algorithm missing") + } + if m.ChecksumFormat == "" { + return errors.Wrap(ErrInvalidManifest, "checksum_format missing") + } + if m.EncodedFilenameCharset == "" { + return errors.Wrap(ErrInvalidManifest, "encoded_filename_charset missing") + } + if m.KeySegmentMaxBytes == 0 { + return errors.Wrap(ErrInvalidManifest, "key_segment_max_bytes is zero") + } + if m.S3MetaSuffix == "" { + return errors.Wrap(ErrInvalidManifest, "s3_meta_suffix missing") + } + if m.S3CollisionStrategy == "" { + return errors.Wrap(ErrInvalidManifest, "s3_collision_strategy missing") + } + if m.DynamoDBLayout != DynamoDBLayoutPerItem && m.DynamoDBLayout != DynamoDBLayoutJSONL { + return errors.Wrapf(ErrInvalidManifest, "dynamodb_layout %q unsupported", m.DynamoDBLayout) + } + return nil +} + +func (m Manifest) validatePhaseSpecific() error { + switch m.Phase { + case PhasePhase0SnapshotDecode: + if m.Live != nil { + return errors.Wrap(ErrInvalidManifest, "phase0 must not set live") + } + case PhasePhase1LivePinned: + if m.Source != nil { + return errors.Wrap(ErrInvalidManifest, "phase1 must not set source") + } + } + return nil +} diff --git a/internal/backup/manifest_test.go b/internal/backup/manifest_test.go new file mode 100644 index 00000000..386a542e --- /dev/null +++ b/internal/backup/manifest_test.go @@ -0,0 +1,220 @@ +package backup + +import ( + "bytes" + "encoding/json" + "strings" + "testing" + "time" + + "github.com/cockroachdb/errors" +) + +func TestManifest_Phase0RoundTrip(t *testing.T) { + t.Parallel() + now := time.Date(2026, 4, 29, 15, 42, 11, 94_000_000, time.UTC) + m := NewPhase0SnapshotManifest(now) + m.ElastickvVersion = "v1.7.3" + m.ClusterID = "ek-prod-us-east-1" + m.SnapshotIndex = 18432021 + m.LastCommitTS = 4517352099840000 + m.Source = &Source{FSMPath: "/data/fsm-snap/0000000000000064.fsm", FSMCRC32C: "deadbeef"} + m.Adapters = Adapters{ + DynamoDB: Adapter{Tables: []string{"orders", "users"}}, + S3: 
Adapter{Buckets: []string{"photos"}}, + Redis: Adapter{Databases: []uint32{0}}, + SQS: Adapter{Queues: []string{"orders-fifo.fifo"}}, + } + m.Exclusions = Exclusions{} // all defaults + + var buf bytes.Buffer + if err := WriteManifest(&buf, m); err != nil { + t.Fatalf("WriteManifest: %v", err) + } + + got, err := ReadManifest(&buf) + if err != nil { + t.Fatalf("ReadManifest: %v", err) + } + if got.Phase != PhasePhase0SnapshotDecode { + t.Fatalf("Phase = %q, want %q", got.Phase, PhasePhase0SnapshotDecode) + } + if got.SnapshotIndex != m.SnapshotIndex { + t.Fatalf("SnapshotIndex = %d, want %d", got.SnapshotIndex, m.SnapshotIndex) + } + if got.Source == nil || got.Source.FSMPath != m.Source.FSMPath { + t.Fatalf("Source.FSMPath = %v, want %v", got.Source, m.Source) + } + if got.Live != nil { + t.Fatalf("phase0 manifest must not set Live, got %+v", got.Live) + } +} + +func TestManifest_Phase1MustNotSetSource(t *testing.T) { + t.Parallel() + m := NewPhase0SnapshotManifest(time.Now()) + m.Phase = PhasePhase1LivePinned + m.Source = &Source{FSMPath: "ignored"} + var buf bytes.Buffer + err := WriteManifest(&buf, m) + if !errors.Is(err, ErrInvalidManifest) { + t.Fatalf("WriteManifest err=%v want ErrInvalidManifest", err) + } +} + +func TestManifest_Phase0MustNotSetLive(t *testing.T) { + t.Parallel() + m := NewPhase0SnapshotManifest(time.Now()) + m.Live = &Live{ReadTS: 12345} + var buf bytes.Buffer + err := WriteManifest(&buf, m) + if !errors.Is(err, ErrInvalidManifest) { + t.Fatalf("WriteManifest err=%v want ErrInvalidManifest", err) + } +} + +func TestReadManifest_RejectsFutureFormatVersion(t *testing.T) { + t.Parallel() + m := NewPhase0SnapshotManifest(time.Now()) + m.FormatVersion = CurrentFormatVersion + 1 + // validate() runs before encoding, so go around it. + body, _ := json.Marshal(m) + _, err := ReadManifest(bytes.NewReader(body)) + if !errors.Is(err, ErrUnsupportedFormatVersion) { + t.Fatalf("err=%v want ErrUnsupportedFormatVersion", err) + } +} + +func TestReadManifest_RejectsZeroFormatVersion(t *testing.T) { + t.Parallel() + m := NewPhase0SnapshotManifest(time.Now()) + m.FormatVersion = 0 + body, _ := json.Marshal(m) + _, err := ReadManifest(bytes.NewReader(body)) + if !errors.Is(err, ErrUnsupportedFormatVersion) { + t.Fatalf("err=%v want ErrUnsupportedFormatVersion", err) + } +} + +func TestReadManifest_RejectsUnknownFields(t *testing.T) { + t.Parallel() + // Format drift safety: an unknown field surfaces loudly rather than + // being silently ignored. 
+ body := `{ + "format_version": 1, + "phase": "phase0-snapshot-decode", + "wall_time_iso": "2026-04-29T00:00:00Z", + "adapters": {"dynamodb":{}, "s3":{}, "redis":{}, "sqs":{}}, + "exclusions": {"include_incomplete_uploads":false,"include_orphans":false,"preserve_sqs_visibility":false,"include_sqs_side_records":false}, + "checksum_algorithm": "sha256", + "checksum_format": "sha256sum", + "encoded_filename_charset": "rfc3986-unreserved-plus-percent", + "key_segment_max_bytes": 240, + "s3_meta_suffix": ".elastickv-meta.json", + "s3_collision_strategy": "leaf-data-suffix", + "dynamodb_layout": "per-item", + "unknown_field": "ahoy" + }` + _, err := ReadManifest(strings.NewReader(body)) + if !errors.Is(err, ErrInvalidManifest) { + t.Fatalf("err=%v want ErrInvalidManifest", err) + } +} + +func TestReadManifest_RejectsUnknownPhase(t *testing.T) { + t.Parallel() + body := `{ + "format_version": 1, + "phase": "phase99-future", + "wall_time_iso": "2026-04-29T00:00:00Z", + "adapters": {"dynamodb":{}, "s3":{}, "redis":{}, "sqs":{}}, + "exclusions": {"include_incomplete_uploads":false,"include_orphans":false,"preserve_sqs_visibility":false,"include_sqs_side_records":false}, + "checksum_algorithm": "sha256", + "checksum_format": "sha256sum", + "encoded_filename_charset": "rfc3986-unreserved-plus-percent", + "key_segment_max_bytes": 240, + "s3_meta_suffix": ".elastickv-meta.json", + "s3_collision_strategy": "leaf-data-suffix", + "dynamodb_layout": "per-item" + }` + _, err := ReadManifest(strings.NewReader(body)) + if !errors.Is(err, ErrInvalidManifest) { + t.Fatalf("err=%v want ErrInvalidManifest", err) + } +} + +func TestReadManifest_RejectsBadWallTime(t *testing.T) { + t.Parallel() + body := `{ + "format_version": 1, + "phase": "phase0-snapshot-decode", + "wall_time_iso": "not-a-date", + "adapters": {"dynamodb":{}, "s3":{}, "redis":{}, "sqs":{}}, + "exclusions": {"include_incomplete_uploads":false,"include_orphans":false,"preserve_sqs_visibility":false,"include_sqs_side_records":false}, + "checksum_algorithm": "sha256", + "checksum_format": "sha256sum", + "encoded_filename_charset": "rfc3986-unreserved-plus-percent", + "key_segment_max_bytes": 240, + "s3_meta_suffix": ".elastickv-meta.json", + "s3_collision_strategy": "leaf-data-suffix", + "dynamodb_layout": "per-item" + }` + _, err := ReadManifest(strings.NewReader(body)) + if !errors.Is(err, ErrInvalidManifest) { + t.Fatalf("err=%v want ErrInvalidManifest", err) + } +} + +func TestReadManifest_RejectsUnsupportedDynamoDBLayout(t *testing.T) { + t.Parallel() + m := NewPhase0SnapshotManifest(time.Now()) + m.DynamoDBLayout = "bogus" + body, _ := json.Marshal(m) + _, err := ReadManifest(bytes.NewReader(body)) + if !errors.Is(err, ErrInvalidManifest) { + t.Fatalf("err=%v want ErrInvalidManifest", err) + } +} + +func TestNewPhase0SnapshotManifest_DefaultsArePopulated(t *testing.T) { + t.Parallel() + m := NewPhase0SnapshotManifest(time.Now()) + if m.FormatVersion != CurrentFormatVersion { + t.Fatalf("FormatVersion = %d, want %d", m.FormatVersion, CurrentFormatVersion) + } + if m.Phase != PhasePhase0SnapshotDecode { + t.Fatalf("Phase = %q, want %q", m.Phase, PhasePhase0SnapshotDecode) + } + if m.ChecksumAlgorithm != ChecksumAlgorithmSHA256 { + t.Fatalf("ChecksumAlgorithm = %q, want %q", m.ChecksumAlgorithm, ChecksumAlgorithmSHA256) + } + if m.ChecksumFormat != ChecksumFormatSha256sum { + t.Fatalf("ChecksumFormat = %q, want %q", m.ChecksumFormat, ChecksumFormatSha256sum) + } + if m.S3MetaSuffix != S3MetaSuffixDefault { + t.Fatalf("S3MetaSuffix = %q", 
m.S3MetaSuffix)
+	}
+	if m.S3CollisionStrategy != S3CollisionStrategyLeafDataSuffix {
+		t.Fatalf("S3CollisionStrategy = %q", m.S3CollisionStrategy)
+	}
+	if m.DynamoDBLayout != DynamoDBLayoutPerItem {
+		t.Fatalf("DynamoDBLayout = %q", m.DynamoDBLayout)
+	}
+	if m.KeySegmentMaxBytes != KeySegmentMaxBytesDefault {
+		t.Fatalf("KeySegmentMaxBytes = %d, want %d", m.KeySegmentMaxBytes, KeySegmentMaxBytesDefault)
+	}
+}
+
+func TestWriteManifest_ProducesPrettyJSON(t *testing.T) {
+	t.Parallel()
+	m := NewPhase0SnapshotManifest(time.Now())
+	var buf bytes.Buffer
+	if err := WriteManifest(&buf, m); err != nil {
+		t.Fatalf("WriteManifest: %v", err)
+	}
+	out := buf.String()
+	// Pretty: contains newlines and the 2-space indent we configured.
+	if !strings.Contains(out, "\n  \"format_version\"") {
+		t.Fatalf("expected pretty 2-space indent in output:\n%s", out)
+	}
+}
diff --git a/internal/backup/redis_string.go b/internal/backup/redis_string.go
new file mode 100644
index 00000000..bd47c4b7
--- /dev/null
+++ b/internal/backup/redis_string.go
@@ -0,0 +1,383 @@
+package backup
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/binary"
+	"encoding/json"
+	"errors"
+	"io"
+	"math"
+	"os"
+	"path/filepath"
+
+	cockroachdberr "github.com/cockroachdb/errors"
+)
+
+// Redis simple-type encoders translate raw snapshot key/value records into
+// the per-adapter directory tree defined by Phase 0
+// (docs/design/2026_04_29_proposed_snapshot_logical_decoder.md). This file
+// covers the three "simple" Redis prefixes — strings, HLLs, and TTL scan
+// index entries — that always map to ONE snapshot record per user key and
+// therefore need no cross-record assembly.
+//
+// Hash / list / set / zset / stream prefixes carry user keys spread across
+// multiple wide-column rows and ship in a follow-up PR.
+
+// Snapshot key prefixes the encoder dispatches on. Kept in sync with
+// adapter/redis_compat_types.go so a renamed prefix in the live code is
+// caught by the corresponding tests here.
+const (
+	RedisStringPrefix = "!redis|str|"
+	RedisHLLPrefix    = "!redis|hll|"
+	RedisTTLPrefix    = "!redis|ttl|"
+
+	// redisStrMagic / redisStrVersion / redisStrHasTTL / redisStrBaseHeader
+	// mirror adapter/redis_compat_types.go:20-24. Re-defined here rather
+	// than imported because the backup package is intentionally adapter-
+	// independent (it must run as an offline tool with no live cluster).
+	redisStrMagic      byte = 0xFF
+	redisStrVersion    byte = 0x01
+	redisStrHasTTL     byte = 0x01
+	redisStrBaseHeader      = 3
+	redisUint64Bytes        = 8
+
+	redisStringsTTLFile = "strings_ttl.jsonl"
+	redisHLLTTLFile     = "hll_ttl.jsonl"
+
+	// redisJSONLBufSize is the bufio.Writer buffer for the per-database
+	// TTL sidecar files. The same 64 KiB tuning as KeymapWriter — large
+	// enough to amortise per-syscall cost across thousands of TTL records.
+	redisJSONLBufSize = 64 << 10
+)
+
+// ErrRedisInvalidStringValue is returned when a !redis|str| value uses the
+// new magic-prefix format but its declared TTL section is truncated. Legacy
+// (no-magic) values are accepted as opaque raw bytes.
+var ErrRedisInvalidStringValue = cockroachdberr.New("backup: invalid !redis|str| value")
+
+// ErrRedisInvalidTTLValue is returned when a !redis|ttl| value is not the
+// expected 8-byte big-endian uint64 millisecond expiry.
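+//
+// Expected value layout (see decodeRedisTTLValue below): exactly eight
+// bytes holding binary.BigEndian.Uint64 epoch-milliseconds. A producer-side
+// sketch (illustrative, not the live adapter's code):
+//
+//	var raw [8]byte
+//	binary.BigEndian.PutUint64(raw[:], expireAtMs)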
+var ErrRedisInvalidTTLValue = cockroachdberr.New("backup: invalid !redis|ttl| value")
+
+// redisKeyKind tracks which Redis-type prefix introduced a user key, so that
+// when a later !redis|ttl| record arrives we know whether to write its
+// expiry into strings_ttl.jsonl, hll_ttl.jsonl, or buffer it for a wide-
+// column type (hash/list/set/zset/stream).
+type redisKeyKind uint8
+
+const (
+	redisKindUnknown redisKeyKind = iota
+	redisKindString
+	redisKindHLL
+)
+
+// RedisDB encodes one logical Redis database (`redis/db_<n>/`). All
+// operations are scoped to its outRoot; the caller wires per-database
+// instances when the producer supports multiple databases (today only
+// db_0 is meaningful).
+//
+// Lifecycle:
+//
+//	r := NewRedisDB(outRoot)
+//	for each snapshot record matching a redis prefix: r.Handle*(...)
+//	r.Finalize()
+//
+// Handle* methods are NOT goroutine-safe; the decoder pipeline is
+// inherently sequential per scope, so a mutex would only add cost.
+type RedisDB struct {
+	outRoot string
+
+	// kindByKey records the Redis type each user key was first seen as.
+	// Populated by HandleString and HandleHLL; consulted by HandleTTL.
+	// Sized for typical clusters (millions of keys × ~50 bytes each is
+	// affordable on the dump host); a follow-up PR introducing the
+	// wide-column types may switch to a streamed approach if profiling
+	// shows this is the binding cost.
+	kindByKey map[string]redisKeyKind
+
+	// stringsTTL / hllTTL are lazily opened on first write. Per the spec,
+	// empty sidecar files are omitted from the dump.
+	stringsTTL *jsonlFile
+	hllTTL     *jsonlFile
+
+	// pendingWideColumnTTL accumulates !redis|ttl| records whose user key
+	// has not been claimed by HandleString / HandleHLL. These are
+	// candidates for hashes/lists/sets/zsets/streams (handled in a
+	// follow-up PR) — for now Finalize logs them via the warning hook
+	// rather than dropping silently.
+	pendingWideColumnTTL []redisTTLPending
+
+	// warn is the structured-warning sink. Non-nil in production
+	// (fed by the decoder driver); nil in tests if the test does not
+	// care about warnings.
+	warn func(event string, fields ...any)
+}
+
+type redisTTLPending struct {
+	UserKey    []byte
+	ExpireAtMs uint64
+}
+
+// NewRedisDB constructs a RedisDB rooted at <outRoot>/redis/db_<n>/. The
+// caller is responsible for choosing <n>; today only 0 is meaningful.
+func NewRedisDB(outRoot string) *RedisDB {
+	return &RedisDB{
+		outRoot:   outRoot,
+		kindByKey: make(map[string]redisKeyKind),
+	}
+}
+
+// WithWarnSink wires a structured-warning sink. The sink is called with
+// stable event names ("redis_orphan_ttl", etc.) and key=value pairs.
+func (r *RedisDB) WithWarnSink(fn func(event string, fields ...any)) *RedisDB {
+	r.warn = fn
+	return r
+}
+
+// HandleString processes one !redis|str| record. The value is the
+// raw stored bytes; HandleString peels the magic-prefix TTL header (if
+// present) and writes the user-visible value to strings/<encoded-key>.bin
+// and the TTL — if any — to strings_ttl.jsonl.
+func (r *RedisDB) HandleString(userKey, value []byte) error {
+	r.kindByKey[string(userKey)] = redisKindString
+	userValue, expireAtMs, err := decodeRedisStringValue(value)
+	if err != nil {
+		return err
+	}
+	if err := r.writeBlob("strings", userKey, userValue); err != nil {
+		return err
+	}
+	if expireAtMs == 0 {
+		return nil
+	}
+	return r.appendTTL(&r.stringsTTL, redisStringsTTLFile, userKey, expireAtMs)
+}
+
+// HandleHLL processes one !redis|hll| record. The value is the raw HLL
+// sketch bytes, written byte-for-byte to hll/<encoded-key>.bin. TTL for
+// HLL keys lives under the !redis|ttl| prefix and is consumed by
+// HandleTTL.
+func (r *RedisDB) HandleHLL(userKey, value []byte) error {
+	r.kindByKey[string(userKey)] = redisKindHLL
+	return r.writeBlob("hll", userKey, value)
+}
+
+// HandleTTL processes one !redis|ttl| record. Routing depends on
+// what HandleString/HandleHLL recorded for the same userKey:
+//
+//   - redisKindHLL -> hll_ttl.jsonl
+//   - redisKindString -> strings_ttl.jsonl (legacy strings, whose TTL
+//     lives in !redis|ttl| rather than the inline magic-prefix header)
+//   - redisKindUnknown -> buffered as pendingWideColumnTTL; reported via
+//     the warn sink on Finalize because Phase 0a's wide-column encoders
+//     have not landed yet.
+func (r *RedisDB) HandleTTL(userKey, value []byte) error {
+	expireAtMs, err := decodeRedisTTLValue(value)
+	if err != nil {
+		return err
+	}
+	switch r.kindByKey[string(userKey)] {
+	case redisKindHLL:
+		return r.appendTTL(&r.hllTTL, redisHLLTTLFile, userKey, expireAtMs)
+	case redisKindString:
+		return r.appendTTL(&r.stringsTTL, redisStringsTTLFile, userKey, expireAtMs)
+	case redisKindUnknown:
+		r.pendingWideColumnTTL = append(r.pendingWideColumnTTL, redisTTLPending{
+			UserKey:    bytes.Clone(userKey),
+			ExpireAtMs: expireAtMs,
+		})
+		return nil
+	}
+	return nil
+}
+
+// Finalize flushes all open sidecar writers and emits warnings for any
+// pending TTL records whose user key was never claimed by the wide-column
+// encoders. Call exactly once after every snapshot record has been
+// dispatched.
+func (r *RedisDB) Finalize() error {
+	var firstErr error
+	if err := closeJSONL(r.stringsTTL); err != nil && firstErr == nil {
+		firstErr = err
+	}
+	if err := closeJSONL(r.hllTTL); err != nil && firstErr == nil {
+		firstErr = err
+	}
+	if r.warn != nil && len(r.pendingWideColumnTTL) > 0 {
+		r.warn("redis_orphan_ttl",
+			"count", len(r.pendingWideColumnTTL),
+			"hint", "wide-column type encoders (hash/list/set/zset/stream) have not landed yet")
+	}
+	return firstErr
+}
+
+func (r *RedisDB) writeBlob(subdir string, userKey, value []byte) error {
+	encoded := EncodeSegment(userKey)
+	dir := filepath.Join(r.outRoot, "redis", "db_0", subdir)
+	if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode
+		return cockroachdberr.WithStack(err)
+	}
+	path := filepath.Join(dir, encoded+".bin")
+	if err := writeFileAtomic(path, value); err != nil {
+		return cockroachdberr.WithStack(err)
+	}
+	return nil
+}
+
+func (r *RedisDB) appendTTL(slot **jsonlFile, baseName string, userKey []byte, expireAtMs uint64) error {
+	if *slot == nil {
+		f, err := openJSONL(filepath.Join(r.outRoot, "redis", "db_0", baseName))
+		if err != nil {
+			return err
+		}
+		*slot = f
+	}
+	rec := struct {
+		Key        string `json:"key"`
+		ExpireAtMs uint64 `json:"expire_at_ms"`
+	}{
+		Key:        EncodeSegment(userKey),
+		ExpireAtMs: expireAtMs,
+	}
+	if err := (*slot).enc.Encode(rec); err != nil {
+		return cockroachdberr.WithStack(err)
+	}
+	return nil
+}
+
+// decodeRedisStringValue strips the redis-string magic-prefix TTL header
+// (if present) from a !redis|str| value and returns (userValue,
+// expireAtMs). expireAtMs == 0 means "no inline TTL"; legacy values
+// always return 0 here because their TTL lives in !redis|ttl|.
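+//
+// New-format byte layout, as implied by the constants above (a sketch,
+// not an authoritative spec):
+//
+//	offset 0       0xFF  (redisStrMagic)
+//	offset 1       0x01  (redisStrVersion)
+//	offset 2       flags (bit 0 = redisStrHasTTL)
+//	offset 3..10   big-endian uint64 expire-at epoch-ms, present only
+//	               when the TTL flag is set
+//	remainder      the user-visible value bytes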
+func decodeRedisStringValue(value []byte) ([]byte, uint64, error) { + if !isNewRedisStrFormat(value) { + return value, 0, nil + } + if len(value) < redisStrBaseHeader { + return nil, 0, cockroachdberr.Wrap(ErrRedisInvalidStringValue, "header truncated") + } + flags := value[2] + rest := value[redisStrBaseHeader:] + if flags&redisStrHasTTL == 0 { + return rest, 0, nil + } + if len(rest) < redisUint64Bytes { + return nil, 0, cockroachdberr.Wrap(ErrRedisInvalidStringValue, "ttl section truncated") + } + rawMs := binary.BigEndian.Uint64(rest[:redisUint64Bytes]) + expireAtMs := rawMs + if expireAtMs > math.MaxInt64 { + expireAtMs = math.MaxInt64 // mirror live decoder's clamp + } + return rest[redisUint64Bytes:], expireAtMs, nil +} + +func isNewRedisStrFormat(raw []byte) bool { + return len(raw) >= 2 && //nolint:mnd // 2 == magic + version length + raw[0] == redisStrMagic && raw[1] == redisStrVersion +} + +func decodeRedisTTLValue(raw []byte) (uint64, error) { + if len(raw) != redisUint64Bytes { + return 0, cockroachdberr.Wrapf(ErrRedisInvalidTTLValue, + "length %d != %d", len(raw), redisUint64Bytes) + } + v := binary.BigEndian.Uint64(raw) + if v > math.MaxInt64 { + v = math.MaxInt64 + } + return v, nil +} + +// jsonlFile bundles a file handle and its bufio writer so callers can +// `f.enc.Encode(rec)` without re-creating the encoder per write. +type jsonlFile struct { + f *os.File + bw *bufio.Writer + enc *json.Encoder +} + +func openJSONL(path string) (*jsonlFile, error) { + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode + return nil, cockroachdberr.WithStack(err) + } + f, err := os.Create(path) //nolint:gosec // path is composed from output-root + fixed file name + if err != nil { + return nil, cockroachdberr.WithStack(err) + } + bw := bufio.NewWriterSize(f, redisJSONLBufSize) + enc := json.NewEncoder(bw) + enc.SetEscapeHTML(false) + return &jsonlFile{f: f, bw: bw, enc: enc}, nil +} + +func closeJSONL(jl *jsonlFile) error { + if jl == nil { + return nil + } + flushErr := jl.bw.Flush() + closeErr := jl.f.Close() + switch { + case flushErr != nil: + return cockroachdberr.WithStack(flushErr) + case closeErr != nil: + return cockroachdberr.WithStack(closeErr) + } + return nil +} + +// writeFileAtomic writes data to path via a tmp+rename so a crash +// mid-write never leaves a partial file. Symbolic links are not followed +// (os.Create truncates a symlink target rather than the link itself; we +// reject symlinks explicitly). +func writeFileAtomic(path string, data []byte) error { + if info, err := os.Lstat(path); err == nil && info.Mode()&os.ModeSymlink != 0 { + return cockroachdberr.WithStack(cockroachdberr.Newf("backup: refusing to overwrite symlink at %s", path)) + } + dir := filepath.Dir(path) + tmp, err := os.CreateTemp(dir, ".bin.tmp-*") + if err != nil { + return cockroachdberr.WithStack(err) + } + tmpPath := tmp.Name() + defer func() { + // Best-effort cleanup if Rename did not consume tmpPath. + if _, statErr := os.Stat(tmpPath); statErr == nil { + _ = os.Remove(tmpPath) + } + }() + if _, err := tmp.Write(data); err != nil { + _ = tmp.Close() + return cockroachdberr.WithStack(err) + } + if err := tmp.Close(); err != nil { + return cockroachdberr.WithStack(err) + } + if err := os.Rename(tmpPath, path); err != nil { + return cockroachdberr.WithStack(err) + } + return nil +} + +// HasInlineTTL reports whether a !redis|str| value carries the new-format +// inline TTL header. Useful for tests asserting the producer's choice. 
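+//
+// For example (illustrative):
+//
+//	if HasInlineTTL(raw) {
+//		// raw = [magic][version][flags][8-byte BE expiry][body...]
+//	}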
+func HasInlineTTL(value []byte) bool {
+	if !isNewRedisStrFormat(value) || len(value) < redisStrBaseHeader {
+		return false
+	}
+	return value[2]&redisStrHasTTL != 0
+}
+
+// IsBlobAtomicWriteRetriable reports whether err from writeFileAtomic is a
+// retriable I/O failure. Today the only error class treated as retriable is
+// io.ErrShortWrite; everything else (including no-space) reports false.
+// Exposed so the master decoder loop can decide whether to abort the whole
+// dump on encountering one.
+func IsBlobAtomicWriteRetriable(err error) bool {
+	if err == nil {
+		return false
+	}
+	// errors.Is handles wrapped paths; the sentinel check is stable for
+	// now because we never wrap io.ErrShortWrite ourselves.
+	return errors.Is(err, io.ErrShortWrite)
+}
diff --git a/internal/backup/redis_string_test.go b/internal/backup/redis_string_test.go
new file mode 100644
index 00000000..bfcd81f6
--- /dev/null
+++ b/internal/backup/redis_string_test.go
@@ -0,0 +1,293 @@
+package backup
+
+import (
+	"bufio"
+	"encoding/binary"
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/cockroachdb/errors"
+)
+
+// fixedExpireMs is a 2026-04-29 00:00:00Z epoch-ms used in fixtures so the
+// asserted values do not drift with wall time.
+const fixedExpireMs uint64 = 1_777_420_800_000
+
+func newRedisDB(t *testing.T) (*RedisDB, string) {
+	t.Helper()
+	root := t.TempDir()
+	return NewRedisDB(root), root
+}
+
+func encodeNewStringValue(t *testing.T, value []byte, expireAtMs uint64) []byte {
+	t.Helper()
+	flags := byte(0)
+	header := []byte{redisStrMagic, redisStrVersion, flags}
+	body := value
+	if expireAtMs > 0 {
+		flags = redisStrHasTTL
+		header[2] = flags
+		var ttl [redisUint64Bytes]byte
+		binary.BigEndian.PutUint64(ttl[:], expireAtMs)
+		header = append(header, ttl[:]...)
+	}
+	return append(header, body...)
+}
+
+func encodeTTLValue(expireAtMs uint64) []byte {
+	var b [redisUint64Bytes]byte
+	binary.BigEndian.PutUint64(b[:], expireAtMs)
+	return b[:]
+}
+
+func readBlob(t *testing.T, path string) []byte {
+	t.Helper()
+	b, err := os.ReadFile(path) //nolint:gosec // test path
+	if err != nil {
+		t.Fatalf("read %s: %v", path, err)
+	}
+	return b
+}
+
+type ttlRecord struct {
+	Key        string `json:"key"`
+	ExpireAtMs uint64 `json:"expire_at_ms"`
+}
+
+func readTTLJSONL(t *testing.T, path string) []ttlRecord {
+	t.Helper()
+	f, err := os.Open(path) //nolint:gosec // test path
+	if err != nil {
+		t.Fatalf("open %s: %v", path, err)
+	}
+	defer f.Close()
+	var out []ttlRecord
+	sc := bufio.NewScanner(f)
+	for sc.Scan() {
+		var r ttlRecord
+		if err := json.Unmarshal(sc.Bytes(), &r); err != nil {
+			t.Fatalf("unmarshal %q: %v", sc.Text(), err)
+		}
+		out = append(out, r)
+	}
+	if err := sc.Err(); err != nil {
+		t.Fatalf("scan: %v", err)
+	}
+	return out
+}
+
+func TestRedisDB_HandleString_NewFormatNoTTL(t *testing.T) {
+	t.Parallel()
+	db, root := newRedisDB(t)
+	val := encodeNewStringValue(t, []byte("hello"), 0)
+	if err := db.HandleString([]byte("greeting"), val); err != nil {
+		t.Fatalf("HandleString: %v", err)
+	}
+	if err := db.Finalize(); err != nil {
+		t.Fatalf("Finalize: %v", err)
+	}
+	body := readBlob(t, filepath.Join(root, "redis", "db_0", "strings", "greeting.bin"))
+	if string(body) != "hello" {
+		t.Fatalf("blob = %q want %q", body, "hello")
+	}
+	// No TTL → strings_ttl.jsonl must not exist (omit empty).
+ if _, err := os.Stat(filepath.Join(root, "redis", "db_0", "strings_ttl.jsonl")); !os.IsNotExist(err) { + t.Fatalf("expected no strings_ttl.jsonl, stat err=%v", err) + } +} + +func TestRedisDB_HandleString_NewFormatWithInlineTTL(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + val := encodeNewStringValue(t, []byte("expiring"), fixedExpireMs) + if err := db.HandleString([]byte("session:abc"), val); err != nil { + t.Fatalf("HandleString: %v", err) + } + if err := db.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + body := readBlob(t, filepath.Join(root, "redis", "db_0", "strings", "session%3Aabc.bin")) + if string(body) != "expiring" { + t.Fatalf("blob = %q want %q", body, "expiring") + } + recs := readTTLJSONL(t, filepath.Join(root, "redis", "db_0", "strings_ttl.jsonl")) + if len(recs) != 1 { + t.Fatalf("ttl records = %d, want 1", len(recs)) + } + if recs[0].Key != "session%3Aabc" { + t.Fatalf("ttl key = %q", recs[0].Key) + } + if recs[0].ExpireAtMs != fixedExpireMs { + t.Fatalf("ttl ms = %d want %d", recs[0].ExpireAtMs, fixedExpireMs) + } +} + +func TestRedisDB_HandleString_LegacyFormatTreatedAsRawValue(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + // Legacy (no magic prefix): bytes are the user value verbatim. + if err := db.HandleString([]byte("legacy"), []byte("\x00\xff\x01raw")); err != nil { + t.Fatalf("HandleString: %v", err) + } + if err := db.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + body := readBlob(t, filepath.Join(root, "redis", "db_0", "strings", "legacy.bin")) + if string(body) != "\x00\xff\x01raw" { + t.Fatalf("blob bytes = %x", body) + } +} + +func TestRedisDB_HandleHLL_WritesRawSketch(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + sketch := []byte{0xde, 0xad, 0xbe, 0xef} + if err := db.HandleHLL([]byte("uniques"), sketch); err != nil { + t.Fatalf("HandleHLL: %v", err) + } + if err := db.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + body := readBlob(t, filepath.Join(root, "redis", "db_0", "hll", "uniques.bin")) + if string(body) != string(sketch) { + t.Fatalf("hll blob = %x want %x", body, sketch) + } +} + +func assertTTLSidecar(t *testing.T, path string, wantKey string, wantMs uint64) { + t.Helper() + recs := readTTLJSONL(t, path) + if len(recs) != 1 { + t.Fatalf("%s: %d records, want 1", path, len(recs)) + } + if recs[0].Key != wantKey { + t.Fatalf("%s: key %q want %q", path, recs[0].Key, wantKey) + } + if recs[0].ExpireAtMs != wantMs { + t.Fatalf("%s: ms %d want %d", path, recs[0].ExpireAtMs, wantMs) + } +} + +func TestRedisDB_HandleTTL_RoutesByPriorTypeObservation(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + mustNoErr := func(err error) { + t.Helper() + if err != nil { + t.Fatal(err) + } + } + // HLL key first, then string key, then TTL records (lex order in + // snapshot: hll < str < ttl). 
+ mustNoErr(db.HandleHLL([]byte("hll-key"), []byte{0x01})) + mustNoErr(db.HandleString([]byte("legacy-str"), []byte("legacy-raw"))) + mustNoErr(db.HandleTTL([]byte("hll-key"), encodeTTLValue(fixedExpireMs))) + mustNoErr(db.HandleTTL([]byte("legacy-str"), encodeTTLValue(fixedExpireMs+1))) + mustNoErr(db.Finalize()) + + assertTTLSidecar(t, filepath.Join(root, "redis", "db_0", "hll_ttl.jsonl"), "hll-key", fixedExpireMs) + assertTTLSidecar(t, filepath.Join(root, "redis", "db_0", "strings_ttl.jsonl"), "legacy-str", fixedExpireMs+1) +} + +func TestRedisDB_HandleTTL_OrphanWarnsOnFinalize(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + var events []string + db.WithWarnSink(func(event string, fields ...any) { + events = append(events, event) + }) + // TTL for a key never claimed by HandleString or HandleHLL — likely + // belongs to a wide-column type (hash/list/set/zset/stream) whose + // encoder has not landed yet. Must not crash. + if err := db.HandleTTL([]byte("orphan"), encodeTTLValue(fixedExpireMs)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + if len(events) != 1 || events[0] != "redis_orphan_ttl" { + t.Fatalf("events = %v want [redis_orphan_ttl]", events) + } +} + +func TestRedisDB_RejectsTruncatedNewFormat(t *testing.T) { + t.Parallel() + cases := [][]byte{ + {redisStrMagic, redisStrVersion}, // header truncated (missing flags) + {redisStrMagic, redisStrVersion, redisStrHasTTL, 0x00, 0x00, 0x00}, // ttl section truncated + } + for _, raw := range cases { + db, _ := newRedisDB(t) + err := db.HandleString([]byte("k"), raw) + if !errors.Is(err, ErrRedisInvalidStringValue) { + t.Fatalf("err=%v want ErrRedisInvalidStringValue (input %x)", err, raw) + } + } +} + +func TestRedisDB_HandleTTL_RejectsBadLength(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + err := db.HandleTTL([]byte("k"), []byte{0x01, 0x02}) + if !errors.Is(err, ErrRedisInvalidTTLValue) { + t.Fatalf("err=%v want ErrRedisInvalidTTLValue", err) + } +} + +func TestRedisDB_FilenamesGoThroughEncodeSegment(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + // User key with reserved bytes. Filename encoding must match the + // EncodeSegment contract (verified by filename_test.go). + if err := db.HandleString([]byte("a/b:c"), encodeNewStringValue(t, []byte("v"), 0)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + want := filepath.Join(root, "redis", "db_0", "strings", "a%2Fb%3Ac.bin") + if _, err := os.Stat(want); err != nil { + t.Fatalf("expected file %s, stat err=%v", want, err) + } +} + +func TestRedisDB_AtomicWriteRefusesSymlinkOverwrite(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + dir := filepath.Join(root, "redis", "db_0", "strings") + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + target := filepath.Join(dir, "victim.bin") + bait := filepath.Join(root, "bait") + if err := os.WriteFile(bait, []byte("stay-out"), 0o600); err != nil { + t.Fatal(err) + } + if err := os.Symlink(bait, target); err != nil { + t.Fatal(err) + } + err := db.HandleString([]byte("victim"), encodeNewStringValue(t, []byte("attack"), 0)) + if err == nil || !strings.Contains(err.Error(), "refusing to overwrite symlink") { + t.Fatalf("expected symlink-refusal error, got %v", err) + } + // Bait file must be untouched. 
+ if got, _ := os.ReadFile(bait); string(got) != "stay-out" { //nolint:gosec // test path + t.Fatalf("bait file written through symlink: %q", got) + } +} + +func TestRedisDB_HasInlineTTL(t *testing.T) { + t.Parallel() + if !HasInlineTTL(encodeNewStringValue(t, []byte("v"), fixedExpireMs)) { + t.Fatalf("HasInlineTTL = false on inline-TTL value") + } + if HasInlineTTL(encodeNewStringValue(t, []byte("v"), 0)) { + t.Fatalf("HasInlineTTL = true on no-TTL value") + } + if HasInlineTTL([]byte("legacy-raw")) { + t.Fatalf("HasInlineTTL = true on legacy value") + } +} diff --git a/internal/backup/sqs.go b/internal/backup/sqs.go new file mode 100644 index 00000000..de31d7fa --- /dev/null +++ b/internal/backup/sqs.go @@ -0,0 +1,595 @@ +package backup + +import ( + "bytes" + "encoding/base64" + "encoding/binary" + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + + "github.com/cockroachdb/errors" +) + +// Snapshot key prefixes the SQS encoder dispatches on. Kept in sync with +// adapter/sqs_keys.go and adapter/sqs_messages.go (see SqsQueueMetaPrefix / +// SqsMsgDataPrefix); a renamed prefix in the live code is caught here at +// dispatch time by the corresponding tests that synthesise records with +// these literal byte strings. +const ( + SQSQueueMetaPrefix = "!sqs|queue|meta|" + SQSQueueGenPrefix = "!sqs|queue|gen|" + SQSQueueSeqPrefix = "!sqs|queue|seq|" + SQSQueueTombstonePrefix = "!sqs|queue|tombstone|" + SQSMsgDataPrefix = "!sqs|msg|data|" + SQSMsgVisPrefix = "!sqs|msg|vis|" + SQSMsgByAgePrefix = "!sqs|msg|byage|" + SQSMsgDedupPrefix = "!sqs|msg|dedup|" + SQSMsgGroupPrefix = "!sqs|msg|group|" +) + +// Stored value magic prefixes (mirrors adapter/sqs_catalog.go and +// adapter/sqs_messages.go). Values that don't carry the right magic are +// rejected — they are either from a future schema version or genuinely +// corrupt, both of which warrant aborting rather than silently emitting +// garbage. +var ( + storedSQSMetaMagic = []byte{0x00, 'S', 'Q', 0x01} + storedSQSMsgMagic = []byte{0x00, 'S', 'M', 0x01} +) + +// genBytes is the fixed width of the BE uint64 generation field in +// !sqs|msg|data| keys. +const genBytes = 8 + +// ErrSQSInvalidQueueMeta is returned for !sqs|queue|meta values that miss +// the magic prefix or fail JSON decoding. +var ErrSQSInvalidQueueMeta = errors.New("backup: invalid !sqs|queue|meta value") + +// ErrSQSInvalidMessage is returned for !sqs|msg|data values that miss the +// magic prefix or fail JSON decoding. +var ErrSQSInvalidMessage = errors.New("backup: invalid !sqs|msg|data value") + +// ErrSQSMalformedKey is returned when an SQS key cannot be parsed for the +// queue-name segment (e.g., the heuristic boundary detection found no +// transition byte). +var ErrSQSMalformedKey = errors.New("backup: malformed SQS key") + +// SQSEncoder encodes the SQS prefix family into the per-queue layout +// described in docs/design/2026_04_29_proposed_snapshot_logical_decoder.md +// (Phase 0): one `_queue.json` per queue and one ordered `messages.jsonl`. +// +// Lifecycle: per-snapshot pass calls Handle* for each record, then exactly +// one Finalize. Side-records (vis/byage/dedup/group/tombstone) are +// excluded by default; opt in with WithIncludeSideRecords. Visibility +// state on emitted messages is zeroed by default; opt in to preserve with +// WithPreserveVisibility. +// +// The encoder buffers messages per queue in memory and sorts them at +// Finalize-time by (SendTimestampMillis, SequenceNumber, MessageID). 
This
+// is acceptable for typical operational queues; queues with hundreds of
+// millions of messages will need a future stream-and-merge variant.
+type SQSEncoder struct {
+	outRoot            string
+	includeSideRecords bool
+	preserveVisibility bool
+
+	// queues is keyed by the base64url-encoded queue name (the on-disk
+	// segment in the !sqs|queue|meta| key). Pending messages are
+	// keyed the same way so meta records arriving later (lex 'q' > 'm')
+	// can resolve them.
+	queues map[string]*sqsQueueState
+
+	warn func(event string, fields ...any)
+}
+
+type sqsQueueState struct {
+	encoded  string // base64url segment from the meta key
+	name     string // decoded queue name; populated on meta arrival
+	meta     *sqsQueueMetaPublic
+	messages []sqsMessageRecord
+	// internalBuf accumulates side records in their on-disk shape if
+	// includeSideRecords is on. Each record carries the prefix, the hex
+	// of the full key, and the base64 value; a structured parse of
+	// side-record keys lands in a follow-up PR, so for now this PR keeps
+	// them as an opaque bag.
+	internalBuf []sqsInternalRecord
+}
+
+type sqsInternalRecord struct {
+	Prefix   string `json:"prefix"`
+	KeyHex   string `json:"key_hex"`
+	ValueB64 string `json:"value_b64"`
+}
+
+// sqsQueueMetaPublic is the dump-format projection of the live
+// adapter/sqs_catalog.go sqsQueueMeta. Field names match the AWS-style
+// vocabulary an external restore tool would use.
+type sqsQueueMetaPublic struct {
+	FormatVersion             uint32 `json:"format_version"`
+	Name                      string `json:"name"`
+	FIFO                      bool   `json:"fifo,omitempty"`
+	ContentBasedDeduplication bool   `json:"content_based_deduplication,omitempty"`
+	VisibilityTimeoutSeconds  int64  `json:"visibility_timeout_seconds"`
+	MessageRetentionSeconds   int64  `json:"message_retention_seconds"`
+	DelaySeconds              int64  `json:"delay_seconds"`
+	ReceiveMessageWaitSeconds int64  `json:"receive_message_wait_seconds,omitempty"`
+	MaximumMessageSize        int64  `json:"maximum_message_size,omitempty"`
+	RedrivePolicy             string `json:"redrive_policy,omitempty"`
+}
+
+// sqsMessageRecord is the dump-format projection. Mirrors the live
+// adapter/sqs_messages.go:80 record one-to-one — JSON tag names match so
+// a restorer can call SendMessage with each line as the input. Visibility
+// state is included in the schema so --preserve-visibility consumers can
+// round-trip; the encoder zeroes the visibility-state fields by default.
+type sqsMessageRecord struct {
+	MessageID              string                     `json:"message_id"`
+	Body                   []byte                     `json:"body"`
+	MD5OfBody              string                     `json:"md5_of_body,omitempty"`
+	MD5OfMessageAttributes string                     `json:"md5_of_message_attributes,omitempty"`
+	MessageAttributes      map[string]json.RawMessage `json:"message_attributes,omitempty"`
+	SenderID               string                     `json:"sender_id,omitempty"`
+	SendTimestampMillis    int64                      `json:"send_timestamp_millis"`
+	AvailableAtMillis      int64                      `json:"available_at_millis"`
+	VisibleAtMillis        int64                      `json:"visible_at_millis"`
+	ReceiveCount           int64                      `json:"receive_count"`
+	FirstReceiveMillis     int64                      `json:"first_receive_millis,omitempty"`
+	CurrentReceiptToken    []byte                     `json:"current_receipt_token,omitempty"`
+	QueueGeneration        uint64                     `json:"queue_generation"`
+	MessageGroupID         string                     `json:"message_group_id,omitempty"`
+	MessageDedupID         string                     `json:"message_deduplication_id,omitempty"`
+	SequenceNumber         uint64                     `json:"sequence_number,omitempty"`
+	DeadLetterSourceArn    string                     `json:"dead_letter_source_arn,omitempty"`
+}
+
+// NewSQSEncoder constructs an encoder rooted at <outRoot>/sqs/.
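+//
+// Typical wiring (an illustrative sketch; outRoot and warnFn are assumed
+// to come from the master pipeline, which is not part of this PR):
+//
+//	enc := NewSQSEncoder(outRoot).WithWarnSink(warnFn)
+//	// ... dispatch one Handle* call per matching snapshot record ...
+//	if err := enc.Finalize(); err != nil {
+//		return err
+//	}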
+func NewSQSEncoder(outRoot string) *SQSEncoder { + return &SQSEncoder{ + outRoot: outRoot, + queues: make(map[string]*sqsQueueState), + } +} + +// WithIncludeSideRecords routes vis/byage/dedup/group/tombstone records +// into _internals/. Default is to exclude them — they are derivable from +// the queue config + message records and replaying them on restore can +// resurrect aborted state. +func (s *SQSEncoder) WithIncludeSideRecords(on bool) *SQSEncoder { + s.includeSideRecords = on + return s +} + +// WithPreserveVisibility passes the visibility-state fields +// (visible_at_millis, current_receipt_token, receive_count, +// first_receive_millis) through to the dump. Default is to zero them so +// the restored queue starts with every message visible. +func (s *SQSEncoder) WithPreserveVisibility(on bool) *SQSEncoder { + s.preserveVisibility = on + return s +} + +// WithWarnSink wires a structured warning hook (same shape as +// RedisDB.WithWarnSink). Used for orphan messages and unresolvable side +// records. +func (s *SQSEncoder) WithWarnSink(fn func(event string, fields ...any)) *SQSEncoder { + s.warn = fn + return s +} + +// HandleQueueMeta processes one !sqs|queue|meta| record. Strips +// the magic prefix, decodes the JSON, projects to the dump-format +// sqsQueueMetaPublic, and parks it on the per-queue state. +func (s *SQSEncoder) HandleQueueMeta(key, value []byte) error { + encoded, err := stripPrefixSegment(key, []byte(SQSQueueMetaPrefix)) + if err != nil { + return err + } + name, err := base64.RawURLEncoding.DecodeString(encoded) + if err != nil { + return errors.Wrap(ErrSQSMalformedKey, err.Error()) + } + meta, err := decodeSQSQueueMetaValue(value) + if err != nil { + return err + } + st := s.queueState(encoded) + st.name = string(name) + st.meta = meta + // The live record carries Name internally; surface it explicitly so + // the dump's _queue.json is self-describing. + if meta.Name == "" { + meta.Name = st.name + } + return nil +} + +// HandleMessageData processes one !sqs|msg|data| +// record. The encoded queue segment is parsed out of the key and used as +// the per-queue routing key; the message is buffered until Finalize so it +// can be sorted and emitted in send-order. +func (s *SQSEncoder) HandleMessageData(key, value []byte) error { + encQueue, err := parseSQSMessageDataKey(key) + if err != nil { + return err + } + rec, err := decodeSQSMessageValue(value) + if err != nil { + return err + } + if !s.preserveVisibility { + rec.VisibleAtMillis = 0 + rec.CurrentReceiptToken = nil + rec.ReceiveCount = 0 + rec.FirstReceiveMillis = 0 + } + st := s.queueState(encQueue) + st.messages = append(st.messages, rec) + return nil +} + +// HandleSideRecord buffers (vis|byage|dedup|group|tombstone) records when +// includeSideRecords is on; otherwise drops them silently (this is the +// documented Phase 0 default). +func (s *SQSEncoder) HandleSideRecord(prefix string, key, value []byte) error { + if !s.includeSideRecords { + return nil + } + encQueue, err := parseSQSGenericKey(key, prefix) + if err != nil { + // Tombstones include a fixed-width gen but no msg ID; the + // generic parser tolerates the empty trailer. + return err + } + st := s.queueState(encQueue) + st.internalBuf = append(st.internalBuf, sqsInternalRecord{ + Prefix: prefix, + KeyHex: fmt.Sprintf("%x", key), + ValueB64: base64.RawURLEncoding.EncodeToString(value), + }) + return nil +} + +// Finalize flushes every queue's _queue.json and messages.jsonl. 
Queues
+// with buffered messages but no meta record (orphans) emit a warning and
+// are skipped — restoring orphan messages without a queue config would
+// silently create a queue with default settings, which is rarely what
+// the operator wants.
+func (s *SQSEncoder) Finalize() error {
+	var firstErr error
+	for _, st := range s.queues {
+		if st.meta == nil {
+			s.emitWarn("sqs_orphan_messages",
+				"encoded_queue", st.encoded,
+				"buffered_messages", len(st.messages),
+				"hint", "no !sqs|queue|meta record matched this encoded prefix; messages dropped from the dump")
+			continue
+		}
+		if err := s.flushQueue(st); err != nil && firstErr == nil {
+			firstErr = err
+		}
+	}
+	return firstErr
+}
+
+func (s *SQSEncoder) flushQueue(st *sqsQueueState) error {
+	dir := filepath.Join(s.outRoot, "sqs", EncodeSegment([]byte(st.name)))
+	if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode
+		return errors.WithStack(err)
+	}
+	if err := writeFileAtomic(filepath.Join(dir, "_queue.json"), mustMarshalIndent(st.meta)); err != nil {
+		return err
+	}
+	if len(st.messages) == 0 {
+		return nil
+	}
+	sortMessagesForEmit(st.messages)
+	jl, err := openJSONL(filepath.Join(dir, "messages.jsonl"))
+	if err != nil {
+		return err
+	}
+	for i := range st.messages {
+		if err := jl.enc.Encode(st.messages[i]); err != nil {
+			_ = closeJSONL(jl)
+			return errors.WithStack(err)
+		}
+	}
+	if err := closeJSONL(jl); err != nil {
+		return err
+	}
+	if len(st.internalBuf) > 0 {
+		if err := s.flushInternals(dir, st.internalBuf); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (s *SQSEncoder) flushInternals(queueDir string, recs []sqsInternalRecord) error {
+	dir := filepath.Join(queueDir, "_internals")
+	if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:mnd // 0755 == standard dir mode
+		return errors.WithStack(err)
+	}
+	jl, err := openJSONL(filepath.Join(dir, "side_records.jsonl"))
+	if err != nil {
+		return err
+	}
+	for i := range recs {
+		if err := jl.enc.Encode(recs[i]); err != nil {
+			_ = closeJSONL(jl)
+			return errors.WithStack(err)
+		}
+	}
+	return closeJSONL(jl)
+}
+
+func (s *SQSEncoder) emitWarn(event string, fields ...any) {
+	if s.warn == nil {
+		return
+	}
+	s.warn(event, fields...)
+}
+
+func (s *SQSEncoder) queueState(encoded string) *sqsQueueState {
+	if st, ok := s.queues[encoded]; ok {
+		return st
+	}
+	st := &sqsQueueState{encoded: encoded}
+	s.queues[encoded] = st
+	return st
+}
+
+// stripPrefixSegment returns the trailing string after a literal prefix.
+// It does NOT decode the segment — the caller decides whether base64url
+// or raw bytes are appropriate for the prefix family.
+func stripPrefixSegment(key, prefix []byte) (string, error) {
+	if !bytes.HasPrefix(key, prefix) {
+		return "", errors.Wrapf(ErrSQSMalformedKey, "key does not start with %q", prefix)
+	}
+	return string(key[len(prefix):]), nil
+}
+
+// parseSQSMessageDataKey peels !sqs|msg|data|<encQueue><gen><encMsgID>
+// and returns encQueue. The gen and msgID are not surfaced because the
+// dump format pulls QueueGeneration / MessageID out of the value record.
+//
+// Boundary detection: encQueue is base64url-no-padding, alphabet
+// [A-Za-z0-9-_]. The gen is 8 raw bytes. For any production gen value
+// (< 2^56), the first byte is 0x00, which is not in the base64url
+// alphabet, so the first non-alphabet byte is the gen-start. We document
+// this assumption rather than build a more elaborate prober — gens
+// approaching 2^56 would have already wrapped many other invariants.
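+//
+// Key layout assumed by the boundary scan (a sketch; widths follow the
+// constants above):
+//
+//	!sqs|msg|data|<encQueue><gen><encMsgID>
+//	  encQueue:  base64url, no padding, alphabet [A-Za-z0-9-_]
+//	  gen:       genBytes (8) raw big-endian bytes; first byte 0x00
+//	             for any gen < 2^56
+//	  encMsgID:  base64url, no padding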
+func parseSQSMessageDataKey(key []byte) (string, error) { + rest, err := stripPrefixSegment(key, []byte(SQSMsgDataPrefix)) + if err != nil { + return "", err + } + idx := scanBase64URLBoundary(rest) + if idx == 0 || idx+genBytes > len(rest) { + return "", errors.Wrapf(ErrSQSMalformedKey, + "queue segment boundary not found in %q", key) + } + encQueue := rest[:idx] + if _, err := base64.RawURLEncoding.DecodeString(encQueue); err != nil { + return "", errors.Wrap(ErrSQSMalformedKey, err.Error()) + } + // Validate the msg-id segment decodes too; if it doesn't, the + // boundary detection got it wrong and we surface an error rather + // than emit a record under a wrong queue. + encMsgID := rest[idx+genBytes:] + if _, err := base64.RawURLEncoding.DecodeString(encMsgID); err != nil { + return "", errors.Wrap(ErrSQSMalformedKey, err.Error()) + } + return encQueue, nil +} + +// parseSQSGenericKey is a coarse parser for the side-record prefixes +// (vis/byage/dedup/group/tombstone). Callers in this PR only need to +// know the encoded queue segment for routing; full structural parsing +// of side-record keys is deferred until Phase 0a's reaper-aware mode +// lands. +func parseSQSGenericKey(key []byte, prefix string) (string, error) { + rest, err := stripPrefixSegment(key, []byte(prefix)) + if err != nil { + return "", err + } + idx := scanBase64URLBoundary(rest) + if idx == 0 { + return "", errors.Wrapf(ErrSQSMalformedKey, + "queue segment not found after prefix %q", prefix) + } + return rest[:idx], nil +} + +// scanBase64URLBoundary returns the index of the first byte in s that is +// NOT in the base64url alphabet [A-Za-z0-9-_]. Returns len(s) if every +// byte is alphabet. +func scanBase64URLBoundary(s string) int { + for i := 0; i < len(s); i++ { + if !isBase64URLByte(s[i]) { + return i + } + } + return len(s) +} + +func isBase64URLByte(c byte) bool { + switch { + case c >= 'A' && c <= 'Z': + return true + case c >= 'a' && c <= 'z': + return true + case c >= '0' && c <= '9': + return true + case c == '-', c == '_': + return true + } + return false +} + +// decodeSQSQueueMetaValue strips the SQ magic prefix, JSON-decodes the +// live sqsQueueMeta, and projects to the dump-format +// sqsQueueMetaPublic. Unknown fields in the live record are tolerated +// (forward-compat with new live-side fields the dump format hasn't +// learned yet). 
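+//
+// For example (hypothetical value): a live body of
+//
+//	{"name":"q","is_fifo":true,"future_field":1}
+//
+// projects to a public record with Name "q" and FIFO true;
+// "future_field" is ignored rather than treated as corruption.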
+func decodeSQSQueueMetaValue(value []byte) (*sqsQueueMetaPublic, error) { + if !bytes.HasPrefix(value, storedSQSMetaMagic) { + return nil, errors.Wrap(ErrSQSInvalidQueueMeta, "missing magic prefix") + } + body := value[len(storedSQSMetaMagic):] + var live struct { + Name string `json:"name"` + IsFIFO bool `json:"is_fifo"` + ContentBasedDedup bool `json:"content_based_dedup"` + VisibilityTimeoutSeconds int64 `json:"visibility_timeout_seconds"` + MessageRetentionSeconds int64 `json:"message_retention_seconds"` + DelaySeconds int64 `json:"delay_seconds"` + ReceiveMessageWaitSeconds int64 `json:"receive_message_wait_seconds"` + MaximumMessageSize int64 `json:"maximum_message_size"` + RedrivePolicy string `json:"redrive_policy"` + } + if err := json.Unmarshal(body, &live); err != nil { + return nil, errors.Wrap(ErrSQSInvalidQueueMeta, err.Error()) + } + return &sqsQueueMetaPublic{ + FormatVersion: 1, + Name: live.Name, + FIFO: live.IsFIFO, + ContentBasedDeduplication: live.ContentBasedDedup, + VisibilityTimeoutSeconds: live.VisibilityTimeoutSeconds, + MessageRetentionSeconds: live.MessageRetentionSeconds, + DelaySeconds: live.DelaySeconds, + ReceiveMessageWaitSeconds: live.ReceiveMessageWaitSeconds, + MaximumMessageSize: live.MaximumMessageSize, + RedrivePolicy: live.RedrivePolicy, + }, nil +} + +// decodeSQSMessageValue strips the SM magic prefix and JSON-decodes the +// live sqsMessageRecord into the dump-format projection. Unlike the +// queue-meta path, every documented field is preserved (the dump format +// is the public projection, so there is nothing to filter). +func decodeSQSMessageValue(value []byte) (sqsMessageRecord, error) { + if !bytes.HasPrefix(value, storedSQSMsgMagic) { + return sqsMessageRecord{}, errors.Wrap(ErrSQSInvalidMessage, "missing magic prefix") + } + body := value[len(storedSQSMsgMagic):] + // The live record uses different JSON tag names for the fields we + // expose under AWS-style names (message_group_id, sequence_number, + // etc.). Unmarshal into a shape that mirrors the live tags, then + // translate. 
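+	// For example, the live tag "message_deduplication_id" below lands
+	// on the dump-side MessageDedupID field.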
+	var live struct {
+		MessageID              string                     `json:"message_id"`
+		Body                   []byte                     `json:"body"`
+		MD5OfBody              string                     `json:"md5_of_body"`
+		MD5OfMessageAttributes string                     `json:"md5_of_message_attributes"`
+		MessageAttributes      map[string]json.RawMessage `json:"message_attributes"`
+		SenderID               string                     `json:"sender_id"`
+		SendTimestampMillis    int64                      `json:"send_timestamp_millis"`
+		AvailableAtMillis      int64                      `json:"available_at_millis"`
+		VisibleAtMillis        int64                      `json:"visible_at_millis"`
+		ReceiveCount           int64                      `json:"receive_count"`
+		FirstReceiveMillis     int64                      `json:"first_receive_millis"`
+		CurrentReceiptToken    []byte                     `json:"current_receipt_token"`
+		QueueGeneration        uint64                     `json:"queue_generation"`
+		MessageGroupID         string                     `json:"message_group_id"`
+		MessageDedupID         string                     `json:"message_deduplication_id"`
+		SequenceNumber         uint64                     `json:"sequence_number"`
+		DeadLetterSourceArn    string                     `json:"dead_letter_source_arn"`
+	}
+	if err := json.Unmarshal(body, &live); err != nil {
+		return sqsMessageRecord{}, errors.Wrap(ErrSQSInvalidMessage, err.Error())
+	}
+	return sqsMessageRecord{
+		MessageID:              live.MessageID,
+		Body:                   live.Body,
+		MD5OfBody:              live.MD5OfBody,
+		MD5OfMessageAttributes: live.MD5OfMessageAttributes,
+		MessageAttributes:      live.MessageAttributes,
+		SenderID:               live.SenderID,
+		SendTimestampMillis:    live.SendTimestampMillis,
+		AvailableAtMillis:      live.AvailableAtMillis,
+		VisibleAtMillis:        live.VisibleAtMillis,
+		ReceiveCount:           live.ReceiveCount,
+		FirstReceiveMillis:     live.FirstReceiveMillis,
+		CurrentReceiptToken:    live.CurrentReceiptToken,
+		QueueGeneration:        live.QueueGeneration,
+		MessageGroupID:         live.MessageGroupID,
+		MessageDedupID:         live.MessageDedupID,
+		SequenceNumber:         live.SequenceNumber,
+		DeadLetterSourceArn:    live.DeadLetterSourceArn,
+	}, nil
+}
+
+func sortMessagesForEmit(msgs []sqsMessageRecord) {
+	sort.SliceStable(msgs, func(i, j int) bool {
+		a, b := msgs[i], msgs[j]
+		switch {
+		case a.SendTimestampMillis != b.SendTimestampMillis:
+			return a.SendTimestampMillis < b.SendTimestampMillis
+		case a.SequenceNumber != b.SequenceNumber:
+			return a.SequenceNumber < b.SequenceNumber
+		default:
+			return a.MessageID < b.MessageID
+		}
+	})
+}
+
+func mustMarshalIndent(v any) []byte {
+	out, err := json.MarshalIndent(v, "", "  ") // two-space indent matches MANIFEST
+	if err != nil {
+		// MarshalIndent only fails on unsupported types; sqsQueueMetaPublic
+		// is a plain struct of primitives. A panic here is a programmer
+		// error rather than a runtime condition we should plan to handle.
+		panic(err)
+	}
+	return out
+}
+
+// keyComponents is a debugging helper exposed for tests; not used by
+// production code paths.
+type keyComponents struct {
+	Prefix  string
+	Encoded string
+	GenRaw  []byte
+	MsgID   string
+}
+
+// peekMsgDataKey is a test/debug helper that returns the structural
+// components of a !sqs|msg|data key. Production code uses
+// parseSQSMessageDataKey directly because it never needs gen or msgID.
+func peekMsgDataKey(key []byte) (keyComponents, error) {
+	rest, err := stripPrefixSegment(key, []byte(SQSMsgDataPrefix))
+	if err != nil {
+		return keyComponents{}, err
+	}
+	idx := scanBase64URLBoundary(rest)
+	if idx == 0 || idx+genBytes > len(rest) {
+		return keyComponents{}, errors.Wrap(ErrSQSMalformedKey, "boundary not found")
+	}
+	return keyComponents{
+		Prefix:  SQSMsgDataPrefix,
+		Encoded: rest[:idx],
+		GenRaw:  []byte(rest[idx : idx+genBytes]),
+		MsgID:   rest[idx+genBytes:],
+	}, nil
+}
+
+// EncodeMsgDataKey constructs a !sqs|msg|data key for tests. Mirrors the
+// live sqsMsgDataKey constructor in adapter/sqs_messages.go.
+func EncodeMsgDataKey(queueName string, gen uint64, messageID string) []byte {
+	out := make([]byte, 0, len(SQSMsgDataPrefix)+64) //nolint:mnd // 64 == sqsKeyCapLarge
+	out = append(out, SQSMsgDataPrefix...)
+	out = append(out, base64.RawURLEncoding.EncodeToString([]byte(queueName))...)
+	var b [genBytes]byte
+	binary.BigEndian.PutUint64(b[:], gen)
+	out = append(out, b[:]...)
+	out = append(out, base64.RawURLEncoding.EncodeToString([]byte(messageID))...)
+	return out
+}
+
+// EncodeQueueMetaKey constructs a !sqs|queue|meta key for tests.
+func EncodeQueueMetaKey(queueName string) []byte {
+	return []byte(SQSQueueMetaPrefix + base64.RawURLEncoding.EncodeToString([]byte(queueName)))
+}
diff --git a/internal/backup/sqs_test.go b/internal/backup/sqs_test.go
new file mode 100644
index 00000000..9307c40a
--- /dev/null
+++ b/internal/backup/sqs_test.go
@@ -0,0 +1,384 @@
+package backup
+
+import (
+	"bufio"
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/cockroachdb/errors"
+)
+
+func newSQSEncoder(t *testing.T) (*SQSEncoder, string) {
+	t.Helper()
+	root := t.TempDir()
+	return NewSQSEncoder(root), root
+}
+
+func encodeQueueMetaValue(t *testing.T, m map[string]any) []byte {
+	t.Helper()
+	body, err := json.Marshal(m)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	out := append([]byte{}, storedSQSMetaMagic...)
+	return append(out, body...)
+}
+
+func encodeMessageValue(t *testing.T, m map[string]any) []byte {
+	t.Helper()
+	body, err := json.Marshal(m)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	out := append([]byte{}, storedSQSMsgMagic...)
+	return append(out, body...)
+}
+
+func readQueueJSON(t *testing.T, path string) map[string]any {
+	t.Helper()
+	b, err := os.ReadFile(path) //nolint:gosec // test path
+	if err != nil {
+		t.Fatalf("read %s: %v", path, err)
+	}
+	var m map[string]any
+	if err := json.Unmarshal(b, &m); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	return m
+}
+
+// floatField fetches a JSON-decoded numeric field as float64, asserting it
+// exists. encoding/json unmarshals every JSON number in a map[string]any
+// as float64; this wrapper keeps that type assertion in one place and
+// fails the test with a structured message if the assumption breaks.
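+//
+// For instance, unmarshalling {"receive_count":3} into map[string]any
+// yields float64(3), not int, which is why the assertions below compare
+// through this helper.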
+func floatField(t *testing.T, m map[string]any, key string) float64 { + t.Helper() + v, ok := m[key] + if !ok { + t.Fatalf("field %q missing in %+v", key, m) + } + f, ok := v.(float64) + if !ok { + t.Fatalf("field %q = %T(%v), want float64", key, v, v) + } + return f +} + +func readMessagesJSONL(t *testing.T, path string) []map[string]any { + t.Helper() + f, err := os.Open(path) //nolint:gosec // test path + if err != nil { + t.Fatalf("open %s: %v", path, err) + } + defer f.Close() + var out []map[string]any + sc := bufio.NewScanner(f) + for sc.Scan() { + var rec map[string]any + if err := json.Unmarshal(sc.Bytes(), &rec); err != nil { + t.Fatalf("unmarshal %q: %v", sc.Text(), err) + } + out = append(out, rec) + } + if err := sc.Err(); err != nil { + t.Fatalf("scan: %v", err) + } + return out +} + +func TestSQS_QueueMetaRoundTrip(t *testing.T) { + t.Parallel() + enc, root := newSQSEncoder(t) + key := EncodeQueueMetaKey("orders-fifo.fifo") + val := encodeQueueMetaValue(t, map[string]any{ + "name": "orders-fifo.fifo", + "is_fifo": true, + "content_based_dedup": false, + "visibility_timeout_seconds": 30, + "message_retention_seconds": 345600, + "delay_seconds": 0, + "receive_message_wait_seconds": 0, + "maximum_message_size": 262144, + }) + if err := enc.HandleQueueMeta(key, val); err != nil { + t.Fatalf("HandleQueueMeta: %v", err) + } + if err := enc.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + got := readQueueJSON(t, filepath.Join(root, "sqs", "orders-fifo.fifo", "_queue.json")) + if got["name"] != "orders-fifo.fifo" { + t.Fatalf("name = %v", got["name"]) + } + if got["fifo"] != true { + t.Fatalf("fifo = %v", got["fifo"]) + } + if floatField(t, got, "visibility_timeout_seconds") != 30 { + t.Fatalf("visibility_timeout_seconds = %v", got["visibility_timeout_seconds"]) + } + if floatField(t, got, "format_version") != 1 { + t.Fatalf("format_version = %v", got["format_version"]) + } +} + +func TestSQS_MessagesSortedByTimestampSeqMessageID(t *testing.T) { + t.Parallel() + enc, root := newSQSEncoder(t) + queue := "orders-fifo.fifo" + if err := enc.HandleQueueMeta(EncodeQueueMetaKey(queue), encodeQueueMetaValue(t, map[string]any{ + "name": queue, "is_fifo": true, "visibility_timeout_seconds": 30, "message_retention_seconds": 60, + })); err != nil { + t.Fatal(err) + } + // Insert in scrambled order; after Finalize the JSONL must be sorted + // by (send_ts, seq, msg_id). 
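	// Worked through on the records below: (90,1) sorts first on
	// timestamp; (100,2) beats (100,3) on seq; the two (200,5) records
	// tie on both and fall back to message_id.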
+ send := func(msgID string, sendMs int64, seq uint64, body string) { + t.Helper() + key := EncodeMsgDataKey(queue, 7, msgID) + val := encodeMessageValue(t, map[string]any{ + "message_id": msgID, + "body": []byte(body), + "send_timestamp_millis": sendMs, + "available_at_millis": sendMs, + "queue_generation": 7, + "sequence_number": seq, + }) + if err := enc.HandleMessageData(key, val); err != nil { + t.Fatalf("HandleMessageData: %v", err) + } + } + send("msg-c", 100, 3, "c") + send("msg-a", 90, 1, "a") + send("msg-b", 100, 2, "b") + send("msg-tieA", 200, 5, "tA") + send("msg-tieB", 200, 5, "tB") + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + recs := readMessagesJSONL(t, filepath.Join(root, "sqs", queue, "messages.jsonl")) + wantOrder := []string{"msg-a", "msg-b", "msg-c", "msg-tieA", "msg-tieB"} + if len(recs) != len(wantOrder) { + t.Fatalf("len = %d want %d", len(recs), len(wantOrder)) + } + for i, w := range wantOrder { + if recs[i]["message_id"] != w { + t.Fatalf("recs[%d].message_id = %v, want %v", i, recs[i]["message_id"], w) + } + } +} + +func TestSQS_DefaultZeroesVisibilityState(t *testing.T) { + t.Parallel() + enc, root := newSQSEncoder(t) + queue := "q" + if err := enc.HandleQueueMeta(EncodeQueueMetaKey(queue), encodeQueueMetaValue(t, map[string]any{ + "name": queue, "visibility_timeout_seconds": 30, "message_retention_seconds": 60, + })); err != nil { + t.Fatal(err) + } + val := encodeMessageValue(t, map[string]any{ + "message_id": "m1", + "body": []byte("payload"), + "send_timestamp_millis": 1000, + "queue_generation": 1, + "visible_at_millis": 5000, // populated mid-flight + "current_receipt_token": []byte{0x01, 0x02}, + "receive_count": 3, + "first_receive_millis": 4500, + }) + if err := enc.HandleMessageData(EncodeMsgDataKey(queue, 1, "m1"), val); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + recs := readMessagesJSONL(t, filepath.Join(root, "sqs", queue, "messages.jsonl")) + if len(recs) != 1 { + t.Fatalf("recs = %d", len(recs)) + } + r := recs[0] + if floatField(t, r, "visible_at_millis") != 0 { + t.Fatalf("visible_at_millis = %v want 0", r["visible_at_millis"]) + } + if _, present := r["current_receipt_token"]; present { + t.Fatalf("current_receipt_token must be omitted on default zeroing, got %v", r["current_receipt_token"]) + } + if floatField(t, r, "receive_count") != 0 { + t.Fatalf("receive_count = %v want 0", r["receive_count"]) + } +} + +func TestSQS_PreserveVisibilityKeepsLiveFields(t *testing.T) { + t.Parallel() + enc, root := newSQSEncoder(t) + enc.WithPreserveVisibility(true) + queue := "q" + if err := enc.HandleQueueMeta(EncodeQueueMetaKey(queue), encodeQueueMetaValue(t, map[string]any{ + "name": queue, "visibility_timeout_seconds": 30, "message_retention_seconds": 60, + })); err != nil { + t.Fatal(err) + } + val := encodeMessageValue(t, map[string]any{ + "message_id": "m1", + "body": []byte("payload"), + "send_timestamp_millis": 1000, + "queue_generation": 1, + "visible_at_millis": 5000, + "receive_count": 3, + "first_receive_millis": 4500, + }) + if err := enc.HandleMessageData(EncodeMsgDataKey(queue, 1, "m1"), val); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + recs := readMessagesJSONL(t, filepath.Join(root, "sqs", queue, "messages.jsonl")) + if floatField(t, recs[0], "visible_at_millis") != 5000 { + t.Fatalf("visible_at_millis = %v want 5000", recs[0]["visible_at_millis"]) + } + if floatField(t, recs[0], "receive_count") != 3 { + 
t.Fatalf("receive_count = %v want 3", recs[0]["receive_count"]) + } +} + +func TestSQS_OrphanMessagesEmitWarning(t *testing.T) { + t.Parallel() + enc, _ := newSQSEncoder(t) + var events []string + enc.WithWarnSink(func(event string, fields ...any) { + events = append(events, event) + }) + // Message arrives before the queue meta record (typical lex order) + // AND no meta record arrives at all (e.g., the queue was deleted + // before the snapshot was taken). The encoder buffers, then warns. + val := encodeMessageValue(t, map[string]any{ + "message_id": "stranded", + "body": []byte("orphan"), + "send_timestamp_millis": 1, + "queue_generation": 1, + }) + if err := enc.HandleMessageData(EncodeMsgDataKey("ghost-queue", 1, "stranded"), val); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + if len(events) != 1 || events[0] != "sqs_orphan_messages" { + t.Fatalf("events = %v want [sqs_orphan_messages]", events) + } +} + +func TestSQS_RejectsValueWithoutMagicPrefix(t *testing.T) { + t.Parallel() + enc, _ := newSQSEncoder(t) + t.Run("queue-meta", func(t *testing.T) { + err := enc.HandleQueueMeta(EncodeQueueMetaKey("q"), []byte(`{"name":"q"}`)) + if !errors.Is(err, ErrSQSInvalidQueueMeta) { + t.Fatalf("err=%v", err) + } + }) + t.Run("message", func(t *testing.T) { + err := enc.HandleMessageData(EncodeMsgDataKey("q", 1, "m"), []byte(`{"message_id":"m"}`)) + if !errors.Is(err, ErrSQSInvalidMessage) { + t.Fatalf("err=%v", err) + } + }) +} + +func TestSQS_RejectsTrailingGarbageAfterMagic(t *testing.T) { + t.Parallel() + enc, _ := newSQSEncoder(t) + bad := append([]byte{}, storedSQSMsgMagic...) + bad = append(bad, []byte("not json")...) + err := enc.HandleMessageData(EncodeMsgDataKey("q", 1, "m"), bad) + if !errors.Is(err, ErrSQSInvalidMessage) { + t.Fatalf("err=%v", err) + } +} + +func TestSQS_RejectsKeyWithWrongPrefix(t *testing.T) { + t.Parallel() + enc, _ := newSQSEncoder(t) + err := enc.HandleQueueMeta([]byte("!unknown|prefix|q"), encodeQueueMetaValue(t, map[string]any{})) + if !errors.Is(err, ErrSQSMalformedKey) { + t.Fatalf("err=%v", err) + } +} + +func TestSQS_PeekMsgDataKeyParsesComponents(t *testing.T) { + t.Parallel() + key := EncodeMsgDataKey("orders", 7, "msg-id") + got, err := peekMsgDataKey(key) + if err != nil { + t.Fatalf("peekMsgDataKey: %v", err) + } + if got.Encoded != "b3JkZXJz" { // base64url("orders") + t.Fatalf("encoded queue = %q", got.Encoded) + } + if len(got.GenRaw) != genBytes { + t.Fatalf("gen length = %d", len(got.GenRaw)) + } + if got.GenRaw[7] != 7 { // BE: high bytes zero, low byte = 7 + t.Fatalf("gen low byte = %x", got.GenRaw[7]) + } + if got.MsgID != "bXNnLWlk" { // base64url("msg-id") + t.Fatalf("msg id = %q", got.MsgID) + } +} + +func TestSQS_IncludeSideRecordsBuffersBetweenFinalize(t *testing.T) { + t.Parallel() + enc, root := newSQSEncoder(t) + enc.WithIncludeSideRecords(true) + queue := "q" + if err := enc.HandleQueueMeta(EncodeQueueMetaKey(queue), encodeQueueMetaValue(t, map[string]any{ + "name": queue, "visibility_timeout_seconds": 30, "message_retention_seconds": 60, + })); err != nil { + t.Fatal(err) + } + val := encodeMessageValue(t, map[string]any{ + "message_id": "m", "body": []byte("v"), "send_timestamp_millis": 1, "queue_generation": 1, + }) + if err := enc.HandleMessageData(EncodeMsgDataKey(queue, 1, "m"), val); err != nil { + t.Fatal(err) + } + // Synthesise a fake vis side-record. parseSQSGenericKey only looks + // at the encoded queue prefix, so a payload-shaped tail is fine. 
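	// The 8-byte trailer below stands in for a gen; the live vis-key may
	// carry more after the queue segment, but this test depends only on
	// the encoded-queue prefix.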
+ visKey := append([]byte(SQSMsgVisPrefix), []byte("cQ")...) // base64url("q") + visKey = append(visKey, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01) + if err := enc.HandleSideRecord(SQSMsgVisPrefix, visKey, []byte("ignored-payload")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + side := filepath.Join(root, "sqs", queue, "_internals", "side_records.jsonl") + if _, err := os.Stat(side); err != nil { + t.Fatalf("expected side file: %v", err) + } +} + +func TestSQS_DefaultDoesNotEmitInternals(t *testing.T) { + t.Parallel() + enc, root := newSQSEncoder(t) + queue := "q" + if err := enc.HandleQueueMeta(EncodeQueueMetaKey(queue), encodeQueueMetaValue(t, map[string]any{ + "name": queue, "visibility_timeout_seconds": 30, "message_retention_seconds": 60, + })); err != nil { + t.Fatal(err) + } + visKey := append([]byte(SQSMsgVisPrefix), []byte("cQ")...) + visKey = append(visKey, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01) + if err := enc.HandleSideRecord(SQSMsgVisPrefix, visKey, []byte("ignored")); err != nil { + t.Fatal(err) + } + if err := enc.Finalize(); err != nil { + t.Fatal(err) + } + if _, err := os.Stat(filepath.Join(root, "sqs", queue, "_internals")); !os.IsNotExist(err) { + t.Fatalf("expected no _internals dir without --include-sqs-side-records, stat err=%v", err) + } +} diff --git a/internal/s3keys/keys.go b/internal/s3keys/keys.go index 5441999f..fe4f892c 100644 --- a/internal/s3keys/keys.go +++ b/internal/s3keys/keys.go @@ -241,6 +241,88 @@ func ObjectManifestScanStart(bucket string, generation uint64, objectPrefix stri return out } +// ParseBlobKey decodes a !s3|blob| key into its components. Both the +// 6-component form produced by BlobKey and the 7-component form +// produced by VersionedBlobKey (with partVersion != 0) are supported; +// partVersion is reported as zero for the un-versioned form. +// +// Returns ok=false on any parse failure (truncated key, malformed +// segment, junk trailer). Used by the offline backup decoder +// (internal/backup) to route blob chunks to their assembled object, +// and by future readers that need to walk the blob keyspace without +// holding a live cluster. +func ParseBlobKey(key []byte) (bucket string, generation uint64, object string, uploadID string, partNo uint64, chunkNo uint64, partVersion uint64, ok bool) { + if !bytes.HasPrefix(key, blobPrefixBytes) { + return "", 0, "", "", 0, 0, 0, false + } + parts, ok := parseBlobKeyHead(key) + if !ok { + return "", 0, "", "", 0, 0, 0, false + } + partVersion, ok = parseOptionalPartVersion(key, parts.next) + if !ok { + return "", 0, "", "", 0, 0, 0, false + } + return parts.bucket, parts.generation, parts.object, parts.uploadID, parts.partNo, parts.chunkNo, partVersion, true +} + +// parsedBlobHead is the 6-component head of a blob key. The optional +// partVersion trailer is parsed separately so cyclomatic complexity +// stays under the package cap. 
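+//
+// Illustrative shape (component order per parseBlobKeyHead; segment and
+// u64 encodings are decodeSegment's and readU64's):
+//
+//	!s3|blob|<bucket><gen><object><uploadID><partNo><chunkNo>[<partVersion>]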
+type parsedBlobHead struct { + bucket string + generation uint64 + object string + uploadID string + partNo uint64 + chunkNo uint64 + next int +} + +func parseBlobKeyHead(key []byte) (parsedBlobHead, bool) { + var p parsedBlobHead + bucketRaw, next, ok := decodeSegment(key, len(blobPrefixBytes)) + if !ok { + return p, false + } + if p.generation, next, ok = readU64(key, next); !ok { + return p, false + } + objectRaw, next, ok := decodeSegment(key, next) + if !ok { + return p, false + } + uploadIDRaw, next, ok := decodeSegment(key, next) + if !ok { + return p, false + } + if p.partNo, next, ok = readU64(key, next); !ok { + return p, false + } + if p.chunkNo, next, ok = readU64(key, next); !ok { + return p, false + } + p.bucket = string(bucketRaw) + p.object = string(objectRaw) + p.uploadID = string(uploadIDRaw) + p.next = next + return p, true +} + +func parseOptionalPartVersion(key []byte, offset int) (uint64, bool) { + switch { + case offset == len(key): + return 0, true + case len(key)-offset == u64Bytes: + v, next, ok := readU64(key, offset) + if !ok || next != len(key) { + return 0, false + } + return v, true + } + return 0, false +} + func ParseObjectManifestKey(key []byte) (bucket string, generation uint64, object string, ok bool) { if !bytes.HasPrefix(key, objectManifestPrefixBytes) { return "", 0, "", false diff --git a/internal/s3keys/keys_test.go b/internal/s3keys/keys_test.go index 3861e7f8..cbf6dfe5 100644 --- a/internal/s3keys/keys_test.go +++ b/internal/s3keys/keys_test.go @@ -117,6 +117,76 @@ func TestParseUploadPartKey_ZeroBytesInSegments(t *testing.T) { require.Equal(t, uint64(3), partNo) } +func TestParseBlobKey_UnversionedRoundTrip(t *testing.T) { + t.Parallel() + + bucket := "photos" + gen := uint64(7) + object := "2026/04/img.jpg" + uploadID := "u-abc" + partNo := uint64(3) + chunkNo := uint64(5) + + key := BlobKey(bucket, gen, object, uploadID, partNo, chunkNo) + gotBucket, gotGen, gotObject, gotUpload, gotPart, gotChunk, gotVersion, ok := ParseBlobKey(key) + require.True(t, ok) + require.Equal(t, bucket, gotBucket) + require.Equal(t, gen, gotGen) + require.Equal(t, object, gotObject) + require.Equal(t, uploadID, gotUpload) + require.Equal(t, partNo, gotPart) + require.Equal(t, chunkNo, gotChunk) + require.Equal(t, uint64(0), gotVersion, "unversioned blob key must report partVersion=0") +} + +func TestParseBlobKey_VersionedRoundTrip(t *testing.T) { + t.Parallel() + + key := VersionedBlobKey("b", 1, "o", "u", 2, 3, 9) + _, _, _, _, gotPart, gotChunk, gotVersion, ok := ParseBlobKey(key) + require.True(t, ok) + require.Equal(t, uint64(2), gotPart) + require.Equal(t, uint64(3), gotChunk) + require.Equal(t, uint64(9), gotVersion) +} + +func TestParseBlobKey_VersionedZeroFallsBackToUnversioned(t *testing.T) { + t.Parallel() + + // VersionedBlobKey(partVersion=0) is documented to fall back to + // the un-versioned shape; ParseBlobKey must agree. 
+ key := VersionedBlobKey("b", 1, "o", "u", 2, 3, 0) + require.True(t, bytes.Equal(key, BlobKey("b", 1, "o", "u", 2, 3))) + _, _, _, _, _, _, gotVersion, ok := ParseBlobKey(key) + require.True(t, ok) + require.Equal(t, uint64(0), gotVersion) +} + +func TestParseBlobKey_RejectsNonBlob(t *testing.T) { + t.Parallel() + + cases := [][]byte{ + BucketMetaKey("b"), + ObjectManifestKey("b", 1, "o"), + UploadPartKey("b", 1, "o", "u", 1), + []byte("not-a-key"), + } + for _, k := range cases { + _, _, _, _, _, _, _, ok := ParseBlobKey(k) + require.False(t, ok, "expected ParseBlobKey to reject %q", k) + } +} + +func TestParseBlobKey_RejectsTrailingGarbage(t *testing.T) { + t.Parallel() + + key := BlobKey("b", 1, "o", "u", 2, 3) + bad := append([]byte{}, key...) + bad = append(bad, 0x00, 0x00, 0x00, 0x00) // 4 trailing bytes -- not 0 and not u64Bytes + _, _, _, _, _, _, _, ok := ParseBlobKey(bad) + require.False(t, ok) +} + func TestParseUploadPartKey_RejectsNonPartKeys(t *testing.T) { t.Parallel()