Skip to content

Commit cd88e63

Browse files
committed
backup: canonicalize DynamoDB N keys + sort schema arrays (PR #716, round 4)
Codex round 9 raised two issues on commit d67baf8: 1. P1: ddbKeyAttrToSegment was emitting `EncodeSegment([]byte(v.N))` for numeric primary-key attributes, but DynamoDB N equality is numeric — `"1"` and `"1.0"` (or `"100"` and `"1e2"`, or `"0.5"` and `"5e-1"`) name the same logical item. In migration mode where source and active rows used different decimal text for the same value, both rows survived at distinct paths and restore replayed duplicates. Mirror the live adapter's canonicalNumberString (adapter/dynamodb.go:7651) which uses big.Rat — same canonical form keeps backup filenames in lockstep with the live equality check. 2. P2: schemaToPublic ranged over Go maps for both global_secondary_indexes and attribute_definitions, so identical snapshots produced different `_schema.json` byte output across runs. Sort by name before append. Tests: - TestDDB_CanonicalNumberKeySegment: equivalence pairs ("1"/"1.0", "100"/"1e2", "-0"/"0", "0.5"/"5e-1") collapse to the same key segment. - TestDDB_SchemaJSONIsDeterministic: 32 calls to schemaToPublic on the same schema produce identical attribute_definitions and GSI orders, both matching the documented sort-by-name.
1 parent d67baf8 commit cd88e63

2 files changed

Lines changed: 158 additions & 6 deletions

File tree

internal/backup/dynamodb.go

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ import (
55
"encoding/base64"
66
"encoding/json"
77
"fmt"
8+
"math/big"
89
"os"
910
"path/filepath"
11+
"sort"
1012
"strconv"
1113
"strings"
1214

@@ -378,14 +380,40 @@ func ddbKeyAttrToSegment(av *pb.DynamoAttributeValue) (string, error) {
378380
case *pb.DynamoAttributeValue_S:
379381
return EncodeSegment([]byte(v.S)), nil
380382
case *pb.DynamoAttributeValue_N:
381-
return EncodeSegment([]byte(v.N)), nil
383+
// DynamoDB N equality is numeric, not lexical: "1" and
384+
// "1.0" name the same primary-key item, and so do "5e-1"
385+
// and "0.5". Without canonicalisation each text form
386+
// produces a distinct filename, so in migration mode the
387+
// "active generation wins" invariant breaks (both source
388+
// and active rows survive at different paths and restore
389+
// replays duplicates). Mirror the live adapter's
390+
// canonicalNumberString (adapter/dynamodb.go:7651) which
391+
// uses big.Rat — same canonical form keeps filename
392+
// identity in lockstep with the live equality check.
393+
// Codex P1 round 9.
394+
return EncodeSegment([]byte(canonicalDDBNumber(v.N))), nil
382395
case *pb.DynamoAttributeValue_B:
383396
return EncodeBinarySegment(v.B), nil
384397
}
385398
return "", errors.Wrapf(ErrDDBInvalidItem,
386399
"primary key has unsupported attribute kind %T", av.GetValue())
387400
}
388401

402+
// canonicalDDBNumber returns the canonical decimal representation of
403+
// a DynamoDB N value. Equivalent inputs (`"1"`, `"1.0"`, `"0.5"`,
404+
// `"5e-1"`, …) collapse to the same string; malformed inputs fall
405+
// through to a trimmed copy so a downstream parse failure surfaces
406+
// the original bytes rather than a silently rewritten value. The
407+
// implementation matches adapter/dynamodb.go canonicalNumberString
408+
// byte-for-byte so backup filenames track live equality.
409+
func canonicalDDBNumber(v string) string {
410+
rat := &big.Rat{}
411+
if _, ok := rat.SetString(strings.TrimSpace(v)); !ok {
412+
return strings.TrimSpace(v)
413+
}
414+
return rat.RatString()
415+
}
416+
389417
// schemaToPublic projects DynamoTableSchema into the AWS-DescribeTable
390418
// JSON shape documented in the design. Fields the live record carries
391419
// for cluster-internal reasons (key_encoding_version, generation,
@@ -407,8 +435,18 @@ func schemaToPublic(s *pb.DynamoTableSchema) ddbPublicSchema {
407435
if pk.RangeKey.Name != "" {
408436
pk.RangeKey.Type = defs[pk.RangeKey.Name]
409437
}
410-
gsis := make([]publicGSI, 0, len(s.GetGlobalSecondaryIndexes()))
411-
for name, gsi := range s.GetGlobalSecondaryIndexes() {
438+
// Build GSI list in deterministic name-sorted order. Ranging over
439+
// the underlying map directly produced a different array order on
440+
// every dump, undermining byte-for-byte reproducibility of
441+
// _schema.json across runs of the same snapshot. Codex P2 round 9.
442+
gsiNames := make([]string, 0, len(s.GetGlobalSecondaryIndexes()))
443+
for name := range s.GetGlobalSecondaryIndexes() {
444+
gsiNames = append(gsiNames, name)
445+
}
446+
sort.Strings(gsiNames)
447+
gsis := make([]publicGSI, 0, len(gsiNames))
448+
for _, name := range gsiNames {
449+
gsi := s.GetGlobalSecondaryIndexes()[name]
412450
g := publicGSI{
413451
Name: name,
414452
KeySchema: publicKeySchema{
@@ -426,9 +464,16 @@ func schemaToPublic(s *pb.DynamoTableSchema) ddbPublicSchema {
426464
g.Projection.NonKeyAttributes = append([]string{}, gsi.GetProjection().GetNonKeyAttributes()...)
427465
gsis = append(gsis, g)
428466
}
429-
attrDefs := make([]publicAttributeDefinition, 0, len(defs))
430-
for name, ty := range defs {
431-
attrDefs = append(attrDefs, publicAttributeDefinition{Name: name, Type: ty})
467+
// AttributeDefinitions is also sorted by attribute name for the
468+
// same determinism reason.
469+
defNames := make([]string, 0, len(defs))
470+
for name := range defs {
471+
defNames = append(defNames, name)
472+
}
473+
sort.Strings(defNames)
474+
attrDefs := make([]publicAttributeDefinition, 0, len(defNames))
475+
for _, name := range defNames {
476+
attrDefs = append(attrDefs, publicAttributeDefinition{Name: name, Type: defs[name]})
432477
}
433478
return ddbPublicSchema{
434479
FormatVersion: 1,

internal/backup/dynamodb_test.go

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,113 @@ func TestDDB_MigrationSourceGenerationItemsAreEmitted(t *testing.T) {
375375
}
376376
}
377377

378+
// TestDDB_CanonicalNumberKeySegment is the regression for Codex P1
379+
// round 9: DynamoDB N equality is numeric, not lexical, but the key
380+
// segment was emitted as `EncodeSegment([]byte(v.N))`. In migration
381+
// mode where source and active rows used different decimal text for
382+
// the same logical N value (e.g. "1" and "1.0"), both rows survived
383+
// at distinct paths and restore replayed duplicates. The encoder
384+
// must canonicalise via big.Rat — same canonical form as the live
385+
// adapter — so equivalent N literals collapse onto the same filename.
386+
func TestDDB_CanonicalNumberKeySegment(t *testing.T) {
387+
t.Parallel()
388+
cases := []struct {
389+
a, b string
390+
}{
391+
{"1", "1.0"},
392+
{"100", "1e2"},
393+
{"-0", "0"},
394+
{"0.5", "5e-1"},
395+
}
396+
for _, tc := range cases {
397+
t.Run(tc.a+"_vs_"+tc.b, func(t *testing.T) {
398+
t.Parallel()
399+
gotA, errA := ddbKeyAttrToSegment(nAttr(tc.a))
400+
gotB, errB := ddbKeyAttrToSegment(nAttr(tc.b))
401+
if errA != nil || errB != nil {
402+
t.Fatalf("err: %v / %v", errA, errB)
403+
}
404+
if gotA != gotB {
405+
t.Fatalf("equivalent N values must canonicalise to the same segment: %q vs %q -> %q vs %q",
406+
tc.a, tc.b, gotA, gotB)
407+
}
408+
})
409+
}
410+
}
411+
412+
// TestDDB_SchemaJSONIsDeterministic is the regression for Codex P2
413+
// round 9: schemaToPublic ranged over Go maps for both
414+
// global_secondary_indexes and attribute_definitions, so identical
415+
// snapshots produced different `_schema.json` byte output across
416+
// runs. The keys are now sorted before append.
417+
func TestDDB_SchemaJSONIsDeterministic(t *testing.T) {
418+
t.Parallel()
419+
schema := &pb.DynamoTableSchema{
420+
TableName: "t",
421+
PrimaryKey: &pb.DynamoKeySchema{HashKey: "id"},
422+
AttributeDefinitions: map[string]string{
423+
"zeta": "S", "alpha": "S", "id": "S", "mu": "N",
424+
},
425+
GlobalSecondaryIndexes: map[string]*pb.DynamoGlobalSecondaryIndex{
426+
"gZ": {KeySchema: &pb.DynamoKeySchema{HashKey: "zeta"}, Projection: &pb.DynamoGSIProjection{ProjectionType: "ALL"}},
427+
"gA": {KeySchema: &pb.DynamoKeySchema{HashKey: "alpha"}, Projection: &pb.DynamoGSIProjection{ProjectionType: "ALL"}},
428+
"gM": {KeySchema: &pb.DynamoKeySchema{HashKey: "mu"}, Projection: &pb.DynamoGSIProjection{ProjectionType: "ALL"}},
429+
},
430+
Generation: 1,
431+
}
432+
// Run schemaToPublic many times — Go's randomised map order
433+
// would otherwise produce different array orders across calls.
434+
want := schemaToPublic(schema)
435+
for i := 0; i < 32; i++ {
436+
got := schemaToPublic(schema)
437+
if !attributeDefinitionsEqual(got.AttributeDefinitions, want.AttributeDefinitions) {
438+
t.Fatalf("attribute_definitions order differs across calls: %+v vs %+v",
439+
got.AttributeDefinitions, want.AttributeDefinitions)
440+
}
441+
if !gsiOrderEqual(got.GlobalSecondaryIndexes, want.GlobalSecondaryIndexes) {
442+
t.Fatalf("global_secondary_indexes order differs across calls: %+v vs %+v",
443+
got.GlobalSecondaryIndexes, want.GlobalSecondaryIndexes)
444+
}
445+
}
446+
// Also assert the order itself is the documented sort-by-name.
447+
wantAttrOrder := []string{"alpha", "id", "mu", "zeta"}
448+
for i, ad := range want.AttributeDefinitions {
449+
if ad.Name != wantAttrOrder[i] {
450+
t.Fatalf("attribute_definitions[%d].Name = %q want %q", i, ad.Name, wantAttrOrder[i])
451+
}
452+
}
453+
wantGSIOrder := []string{"gA", "gM", "gZ"}
454+
for i, g := range want.GlobalSecondaryIndexes {
455+
if g.Name != wantGSIOrder[i] {
456+
t.Fatalf("global_secondary_indexes[%d].Name = %q want %q", i, g.Name, wantGSIOrder[i])
457+
}
458+
}
459+
}
460+
461+
func attributeDefinitionsEqual(a, b []publicAttributeDefinition) bool {
462+
if len(a) != len(b) {
463+
return false
464+
}
465+
for i := range a {
466+
if a[i] != b[i] {
467+
return false
468+
}
469+
}
470+
return true
471+
}
472+
473+
func gsiOrderEqual(a, b []publicGSI) bool {
474+
if len(a) != len(b) {
475+
return false
476+
}
477+
for i := range a {
478+
if a[i].Name != b[i].Name {
479+
return false
480+
}
481+
}
482+
return true
483+
}
484+
378485
func TestDDB_NewGenerationWinsOverMigrationSourceForSameKey(t *testing.T) {
379486
t.Parallel()
380487
enc, root := newDDBEncoder(t)

0 commit comments

Comments
 (0)