diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c375fb83..524858d9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ All notable changes to this project will be documented in this file. ### Added - Schema: Added a `Decimal` common type carrying precision and scale via a new `LogicalParams` struct, enabling lossless conversion between Avro, Parquet, and database `NUMBER`/`NUMERIC` decimals. Includes `NewDecimal`, `FormatDecimal`/`ParseDecimal`, and `DecimalParams.Format`/`Parse`/`ValidateValue` helpers, plus a `Common.Validate` entry point. `ParseFromAny` and `InferFromAny` now accept `encoding/json.Number` values, so schemas pipelined through `json.Decoder.UseNumber()` round-trip without precision loss. (@Jeffail) +- Schema: Added a `BigDecimal` common type for arbitrary-precision decimals, alongside `NewBigDecimal`, `FormatBigDecimal`, and `ParseBigDecimal` helpers. Use it for sources that lack column-level precision (Postgres `numeric` without `(p, s)`, Oracle `NUMBER` with no `DATA_PRECISION`, MongoDB `Decimal128`). `Common.Validate` enforces `BigDecimal` as a leaf type with no logical parameters. (@Jeffail) ## 4.70.0 - 2026-04-02 diff --git a/public/schema/bigdecimal.go b/public/schema/bigdecimal.go new file mode 100644 index 000000000..e349c60b8 --- /dev/null +++ b/public/schema/bigdecimal.go @@ -0,0 +1,64 @@ +// Copyright 2026 Redpanda Data, Inc. + +package schema + +import ( + "fmt" + "math/big" +) + +// NewBigDecimal constructs a Common schema for a [BigDecimal] column — +// an arbitrary-precision decimal with no schema-level precision or scale +// commitment. Use this for sources where the column type does not carry +// fixed precision and scale (e.g. unparameterised Postgres NUMERIC, Oracle +// NUMBER without DATA_PRECISION, MongoDB Decimal128). 
+func NewBigDecimal(name string, optional bool) Common { + return Common{ + Name: name, + Type: BigDecimal, + Optional: optional, + } +} + +// FormatBigDecimal renders an unscaled integer at the given scale as a +// canonical [BigDecimal] string. Output rules match [FormatDecimal]: leading +// minus for negatives only, no leading plus, no leading zeros aside from a +// single "0" before the decimal point, decimal point present iff scale > 0, +// exactly scale fractional digits emitted. +// +// Unlike [DecimalParams.Format], the [BigDecimal] schema imposes no fixed +// scale; callers pick the scale that matches the source value's natural +// precision. The scale parameter must be non-negative. +func FormatBigDecimal(unscaled *big.Int, scale int32) (string, error) { + return FormatDecimal(unscaled, scale) +} + +// ParseBigDecimal interprets s as a decimal-shaped string and returns the +// unscaled integer alongside the scale recovered from the number of +// fractional digits in the input. +// +// The accepted form matches [ParseDecimal]: lenient on non-canonical-but- +// unambiguous inputs (leading plus, leading zeros, missing integer part as +// in ".5"), strict on ambiguous or malformed inputs (scientific notation, +// multiple decimal points, whitespace, thousands separators, non-digit +// characters). Canonical form is enforced when values are re-emitted via +// [FormatBigDecimal]. Unlike [ParseDecimal], the scale is recovered from +// the input rather than supplied by the caller. +// +// The parser does not bound the input length. The underlying big.Int parse +// is super-linear, so callers exposing this directly to untrusted input +// should impose their own length cap. 
+func ParseBigDecimal(s string) (*big.Int, int32, error) { + sign, intPart, fracPart, err := parseCanonicalDecimal(s) + if err != nil { + return nil, 0, err + } + + raw := sign + intPart + fracPart + n, ok := new(big.Int).SetString(raw, 10) + if !ok { + return nil, 0, fmt.Errorf("failed to parse decimal value %q", s) + } + + return n, int32(len(fracPart)), nil +} diff --git a/public/schema/bigdecimal_test.go b/public/schema/bigdecimal_test.go new file mode 100644 index 000000000..1d02652a5 --- /dev/null +++ b/public/schema/bigdecimal_test.go @@ -0,0 +1,246 @@ +// Copyright 2026 Redpanda Data, Inc. + +package schema + +import ( + "math/big" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewBigDecimal(t *testing.T) { + c := NewBigDecimal("amount", true) + assert.Equal(t, BigDecimal, c.Type) + assert.Equal(t, "amount", c.Name) + assert.True(t, c.Optional) + assert.Nil(t, c.Logical) + assert.NoError(t, c.Validate()) +} + +func TestBigDecimalToAnyOmitsParams(t *testing.T) { + c := NewBigDecimal("x", false) + m, ok := c.ToAny().(map[string]any) + require.True(t, ok) + + assert.Equal(t, "BIG_DECIMAL", m[anyFieldType]) + _, hasPrecision := m[anyFieldPrecision] + _, hasScale := m[anyFieldScale] + assert.False(t, hasPrecision) + assert.False(t, hasScale) +} + +func TestBigDecimalRoundTrip(t *testing.T) { + original := NewBigDecimal("balance", true) + parsed, err := ParseFromAny(original.ToAny()) + require.NoError(t, err) + assert.Equal(t, original.Type, parsed.Type) + assert.Equal(t, original.Name, parsed.Name) + assert.Equal(t, original.Optional, parsed.Optional) + assert.Nil(t, parsed.Logical) + assert.Equal(t, original.fingerprint(), parsed.fingerprint()) +} + +func TestBigDecimalParseFromAnyRejectsParams(t *testing.T) { + in := map[string]any{ + anyFieldType: "BIG_DECIMAL", + anyFieldName: "x", + anyFieldPrecision: int64(10), + anyFieldScale: int64(2), + } + _, err := ParseFromAny(in) + 
require.Error(t, err) + assert.Contains(t, err.Error(), "only valid for type DECIMAL") +} + +func TestBigDecimalValidateRejectsLogicalDecimal(t *testing.T) { + c := Common{ + Type: BigDecimal, + Name: "x", + Logical: &LogicalParams{Decimal: &DecimalParams{Precision: 10, Scale: 2}}, + } + err := c.Validate() + require.Error(t, err) + assert.Contains(t, err.Error(), "only valid for type DECIMAL") +} + +func TestBigDecimalValidateRejectsChildren(t *testing.T) { + c := Common{ + Type: BigDecimal, + Name: "x", + Children: []Common{{Type: String, Name: "weird"}}, + } + err := c.Validate() + require.Error(t, err) + assert.Contains(t, err.Error(), "is a leaf and must not have children") +} + +func TestFormatBigDecimal(t *testing.T) { + tests := []struct { + name string + unscaled string + scale int32 + want string + }{ + {"zero scale zero", "0", 0, "0"}, + {"zero scale four", "0", 4, "0.0000"}, + {"twelve thousand scale four", "12345", 4, "1.2345"}, + {"negative one scale four", "-1", 4, "-0.0001"}, + {"large scale", "1", 30, "0." + strings.Repeat("0", 29) + "1"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + n, ok := new(big.Int).SetString(tt.unscaled, 10) + require.True(t, ok) + got, err := FormatBigDecimal(n, tt.scale) + require.NoError(t, err) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestFormatBigDecimalNoNegativeZero(t *testing.T) { + // big.Int has no concept of -0 — Sign() returns 0 for zero values, so + // we never emit a leading minus on a zero magnitude. Verify both + // constructions land on the same canonical zero string. 
+ zeroPos := big.NewInt(0) + zeroNeg := new(big.Int).Neg(big.NewInt(0)) + + got, err := FormatBigDecimal(zeroPos, 4) + require.NoError(t, err) + assert.Equal(t, "0.0000", got) + + got, err = FormatBigDecimal(zeroNeg, 4) + require.NoError(t, err) + assert.Equal(t, "0.0000", got) +} + +func TestParseBigDecimal(t *testing.T) { + tests := []struct { + name string + input string + unscaled string + scale int32 + }{ + {"integer", "12345", "12345", 0}, + {"negative integer", "-12345", "-12345", 0}, + {"fractional", "1.5", "15", 1}, + {"three fractional", "1.500", "1500", 3}, + {"zero", "0", "0", 0}, + {"zero with scale", "0.0000", "0", 4}, + {"negative fractional", "-0.0001", "-1", 4}, + {"trailing dot", "1.", "1", 0}, + {"high scale", "0." + strings.Repeat("0", 29) + "1", "1", 30}, + // Lenient acceptance — non-canonical but unambiguous. + {"leading plus", "+1.5", "15", 1}, + {"leading zero", "01.5", "15", 1}, + {"leading zeros multiple", "-001", "-1", 0}, + {"missing integer part", ".5", "5", 1}, + {"missing integer part with sign", "-.5", "-5", 1}, + {"plus and missing integer", "+.5", "5", 1}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + n, scale, err := ParseBigDecimal(tt.input) + require.NoError(t, err) + assert.Equal(t, tt.unscaled, n.String()) + assert.Equal(t, tt.scale, scale) + }) + } +} + +func TestParseBigDecimalErrors(t *testing.T) { + tests := []struct { + name string + input string + wantErr string + }{ + {"empty", "", "must not be empty"}, + {"just minus", "-", "no digits"}, + {"just plus", "+", "no digits"}, + {"just dot", ".", "no digits"}, + {"two dots", "1.2.3", "at most one decimal point"}, + {"non-digit", "1.2a", "non-digit"}, + {"scientific notation", "1e5", "non-digit"}, + {"whitespace", " 1.5", "non-digit"}, + {"thousands separator", "1,000", "non-digit"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, _, err := ParseBigDecimal(tt.input) + require.Error(t, err) + assert.Contains(t, 
err.Error(), tt.wantErr) + }) + } +} + +func TestBigDecimalFormatParseRoundTrip(t *testing.T) { + // Parse a canonical string, format the recovered (unscaled, scale) + // pair, and confirm we get the original string back. + values := []string{ + "0", + "0.0", + "0.0000", + "1", + "-1", + "1.5", + "-1.5", + "12345.6789", + "-12345.6789", + "0.000000000000000000000000000001", + } + + for _, v := range values { + t.Run(v, func(t *testing.T) { + unscaled, scale, err := ParseBigDecimal(v) + require.NoError(t, err) + got, err := FormatBigDecimal(unscaled, scale) + require.NoError(t, err) + assert.Equal(t, v, got) + }) + } +} + +func TestParseBigDecimalNormalisesToCanonical(t *testing.T) { + // Postel: lenient parse, strict emit. Non-canonical inputs that the + // parser accepts must come back out in canonical form when re-emitted + // via FormatBigDecimal. + tests := []struct { + input string + want string + }{ + {"+1.5", "1.5"}, + {"01.5", "1.5"}, + {"-001.5", "-1.5"}, + {".5", "0.5"}, + {"+.5", "0.5"}, + {"-.5", "-0.5"}, + {"+0.0001", "0.0001"}, + {"01.500", "1.500"}, // trailing zeros preserved (scale 3) + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + n, scale, err := ParseBigDecimal(tt.input) + require.NoError(t, err) + got, err := FormatBigDecimal(n, scale) + require.NoError(t, err) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestBigDecimalFingerprintDistinctFromDecimal(t *testing.T) { + bd := NewBigDecimal("amount", false) + d := Common{ + Type: Decimal, + Name: "amount", + Logical: &LogicalParams{Decimal: &DecimalParams{Precision: 10, Scale: 2}}, + } + assert.NotEqual(t, bd.fingerprint(), d.fingerprint()) +} diff --git a/public/schema/common.go b/public/schema/common.go index e920755a4..473b97532 100644 --- a/public/schema/common.go +++ b/public/schema/common.go @@ -74,21 +74,22 @@ type CommonType int // Supported common types const ( - Boolean CommonType = 1 - Int32 CommonType = 2 - Int64 CommonType = 3 - Float32 CommonType = 4 
- Float64 CommonType = 5 - String CommonType = 6 - ByteArray CommonType = 7 - Object CommonType = 8 - Map CommonType = 9 - Array CommonType = 10 - Null CommonType = 11 - Union CommonType = 12 - Timestamp CommonType = 13 - Any CommonType = 14 - Decimal CommonType = 15 + Boolean CommonType = 1 + Int32 CommonType = 2 + Int64 CommonType = 3 + Float32 CommonType = 4 + Float64 CommonType = 5 + String CommonType = 6 + ByteArray CommonType = 7 + Object CommonType = 8 + Map CommonType = 9 + Array CommonType = 10 + Null CommonType = 11 + Union CommonType = 12 + Timestamp CommonType = 13 + Any CommonType = 14 + Decimal CommonType = 15 + BigDecimal CommonType = 16 ) // Decimal precision bounds. The upper bound matches the widest precision that @@ -131,6 +132,8 @@ func (t CommonType) String() string { return "ANY" case Decimal: return "DECIMAL" + case BigDecimal: + return "BIG_DECIMAL" default: return "UNKNOWN" } @@ -168,6 +171,8 @@ func typeFromStr(v string) (CommonType, error) { return Any, nil case "DECIMAL": return Decimal, nil + case "BIG_DECIMAL": + return BigDecimal, nil default: return 0, fmt.Errorf("unrecognised type string: %v", v) } @@ -406,12 +411,14 @@ func int32Bounded(n int64, key string) (int32, error) { } // Validate enforces the parameter invariants of parameterised types -// (currently only [Decimal]) and that no parameter blocks are attached to -// types that do not accept them. It recurses into [Common.Children]. +// — [Decimal] precision/scale bounds, and the rule that [Decimal] parameters +// are attached to no other type — and the structural invariant that leaf types +// do not carry children. Only the container types ([Object], [Map], [Array], +// [Union]) may populate [Common.Children]. Validate recurses into Children. // -// Structural invariants — for example that an [Object] has children, or that -// a [Union] has more than one child — are not currently enforced; the -// validation surface may grow as new logical types arrive. 
+// Other structural invariants — for example that an [Object] has children, +// or that a [Union] has more than one child — are not currently enforced; +// the validation surface may grow as new logical types arrive. // // Schemas constructed via [ParseFromAny] are validated automatically. Schemas // constructed by struct literal should call Validate before being passed to @@ -432,6 +439,10 @@ func (c *Common) Validate() error { return fmt.Errorf("Logical.Decimal parameters are only valid for type DECIMAL, got %v", c.Type) } + if !c.isContainerType() && len(c.Children) > 0 { + return fmt.Errorf("type %v is a leaf and must not have children", c.Type) + } + for i, child := range c.Children { if err := child.Validate(); err != nil { return fmt.Errorf("child %d (%q): %w", i, child.Name, err) @@ -441,6 +452,19 @@ func (c *Common) Validate() error { return nil } +// isContainerType reports whether the schema's type is one of the container +// types — [Object], [Map], [Array], or [Union] — for which populating +// [Common.Children] is structurally meaningful. Every other type is a leaf +// and must have no children. +func (c *Common) isContainerType() bool { + switch c.Type { + case Object, Map, Array, Union: + return true + default: + return false + } +} + // Fingerprint returns a deterministic hash identifier for the schema structure. // Two schemas with the same structure will produce the same fingerprint, // regardless of memory location. 
This is useful for caching schema conversions diff --git a/public/schema/common_test.go b/public/schema/common_test.go index a3a77148c..dc8cdd7c1 100644 --- a/public/schema/common_test.go +++ b/public/schema/common_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestSchemaStringify(t *testing.T) { @@ -30,9 +31,57 @@ func TestSchemaStringify(t *testing.T) { {Input: Timestamp, Output: "TIMESTAMP"}, {Input: Any, Output: "ANY"}, {Input: Decimal, Output: "DECIMAL"}, + {Input: BigDecimal, Output: "BIG_DECIMAL"}, {Input: zeroType, Output: "UNKNOWN"}, {Input: CommonType(-1), Output: "UNKNOWN"}, } { assert.Equal(t, test.Input.String(), test.Output) } } + +func TestValidateRejectsChildrenOnLeafTypes(t *testing.T) { + leafTypes := []CommonType{ + Boolean, Int32, Int64, Float32, Float64, String, ByteArray, + Null, Timestamp, Any, BigDecimal, + } + + for _, typ := range leafTypes { + t.Run(typ.String(), func(t *testing.T) { + c := Common{ + Type: typ, + Name: "x", + Children: []Common{{Type: String, Name: "weird"}}, + } + err := c.Validate() + require.Error(t, err) + assert.Contains(t, err.Error(), "is a leaf and must not have children") + }) + } +} + +func TestValidateAllowsChildrenOnContainerTypes(t *testing.T) { + containers := []CommonType{Object, Map, Array, Union} + + for _, typ := range containers { + t.Run(typ.String(), func(t *testing.T) { + c := Common{ + Type: typ, + Name: "x", + Children: []Common{{Type: String, Name: "field"}}, + } + assert.NoError(t, c.Validate()) + }) + } +} + +func TestValidateRejectsChildrenOnDecimal(t *testing.T) { + c := Common{ + Type: Decimal, + Name: "amount", + Logical: &LogicalParams{Decimal: &DecimalParams{Precision: 10, Scale: 2}}, + Children: []Common{{Type: String, Name: "weird"}}, + } + err := c.Validate() + require.Error(t, err) + assert.Contains(t, err.Error(), "is a leaf and must not have children") +} diff --git a/public/schema/decimal.go 
b/public/schema/decimal.go index 3303e5751..be5f03221 100644 --- a/public/schema/decimal.go +++ b/public/schema/decimal.go @@ -5,6 +5,7 @@ package schema import ( "errors" "fmt" + "math" "math/big" "strings" ) @@ -73,66 +74,110 @@ func FormatDecimal(unscaled *big.Int, scale int32) (string, error) { return sign + abs[:splitAt] + "." + abs[splitAt:], nil } -// ParseDecimal interprets s as a canonical decimal string and returns the +// ParseDecimal interprets s as a decimal-shaped string and returns the // unscaled integer at the given scale. Inputs with fewer fractional digits -// than scale are accepted and right-padded with zeros; inputs with more -// fractional digits than scale are rejected. +// than scale are right-padded with zeros; inputs with more fractional digits +// than scale are rejected. // -// Scientific notation, leading plus signs, thousands separators, multiple -// decimal points, and whitespace are not accepted. The integer part of the -// number is required (".5" is not accepted; use "0.5"). The scale parameter -// must be non-negative. +// The parser is lenient: leading plus signs, leading zeros, and inputs +// missing the integer part (e.g. ".5") are accepted and normalised. The +// parser is strict about ambiguous or malformed inputs — scientific +// notation, multiple decimal points, whitespace, thousands separators, and +// non-digit characters are rejected. Canonical form is enforced when values +// are re-emitted via [FormatDecimal]. The scale parameter must be +// non-negative. // // Precision is not enforced here; use [DecimalParams.Parse] when both // precision and scale must be checked. +// +// The parser does not bound the input length. The underlying big.Int parse +// is super-linear, so callers exposing this directly to untrusted input +// should impose their own length cap. 
func ParseDecimal(s string, scale int32) (*big.Int, error) { if scale < 0 { return nil, fmt.Errorf("scale must be non-negative, got %d", scale) } + + sign, intPart, fracPart, err := parseCanonicalDecimal(s) + if err != nil { + return nil, err + } + + if int32(len(fracPart)) > scale { + return nil, fmt.Errorf("decimal string has %d fractional digits, exceeds scale %d", len(fracPart), scale) + } + + padded := fracPart + strings.Repeat("0", int(scale)-len(fracPart)) + raw := sign + intPart + padded + + n, ok := new(big.Int).SetString(raw, 10) + if !ok { + return nil, fmt.Errorf("failed to parse decimal value %q", s) + } + return n, nil +} + +// parseCanonicalDecimal parses a decimal-shaped string into its components +// without applying any scale-specific transformation. The returned sign is "" +// or "-", and intPart and fracPart are digit-only strings (fracPart may be +// empty). +// +// Parsing follows Postel's principle: the function is lenient about +// non-canonical-but-unambiguous inputs (leading "+" sign, leading zeros, +// missing integer part as in ".5") and strict about ambiguous or malformed +// inputs (scientific notation, multiple decimal points, whitespace, +// thousands separators, non-digit characters). The canonical-form guarantee +// is upheld at the emit boundary by [FormatDecimal] and [FormatBigDecimal]; +// strict parsing would duplicate that responsibility at a less useful layer. +func parseCanonicalDecimal(s string) (sign, intPart, fracPart string, err error) { if s == "" { - return nil, errors.New("decimal string must not be empty") + return "", "", "", errors.New("decimal string must not be empty") } rest := s - sign := "" switch rest[0] { case '-': sign = "-" rest = rest[1:] case '+': - return nil, errors.New("decimal string must not have a leading plus sign") + // Leading plus is unambiguous and accepted; the canonical form just + // omits it. 
+ rest = rest[1:] } if rest == "" { - return nil, errors.New("decimal string has no digits") + return "", "", "", errors.New("decimal string has no digits") } - intPart, fracPart, hasDot := strings.Cut(rest, ".") + var hasDot bool + intPart, fracPart, hasDot = strings.Cut(rest, ".") if hasDot && strings.Contains(fracPart, ".") { - return nil, errors.New("decimal string must contain at most one decimal point") + return "", "", "", errors.New("decimal string must contain at most one decimal point") } if intPart == "" { - return nil, errors.New("decimal string is missing the integer part") + if !hasDot || fracPart == "" { + return "", "", "", errors.New("decimal string has no digits") + } + // Inputs like ".5" — accept and treat as "0.5". The canonical emit + // path will produce the leading zero on the way out. + intPart = "0" } if err := requireDigits(intPart); err != nil { - return nil, err + return "", "", "", err } if err := requireDigits(fracPart); err != nil { - return nil, err + return "", "", "", err } - if int32(len(fracPart)) > scale { - return nil, fmt.Errorf("decimal string has %d fractional digits, exceeds scale %d", len(fracPart), scale) + // Cap the fractional length so callers downstream can safely cast it to + // the int32 scale type without silent wrap-around. The integer part is + // not bounded here — its length only feeds big.Int.SetString, which + // handles arbitrary lengths correctly (if slowly). 
+ if len(fracPart) > math.MaxInt32 { + return "", "", "", fmt.Errorf("decimal string has %d fractional digits, exceeds maximum %d", len(fracPart), math.MaxInt32) } - padded := fracPart + strings.Repeat("0", int(scale)-len(fracPart)) - raw := sign + intPart + padded - - n, ok := new(big.Int).SetString(raw, 10) - if !ok { - return nil, fmt.Errorf("failed to parse decimal value %q", s) - } - return n, nil + return sign, intPart, fracPart, nil } func requireDigits(s string) error { diff --git a/public/schema/decimal_test.go b/public/schema/decimal_test.go index a4fe61431..b4e61e17a 100644 --- a/public/schema/decimal_test.go +++ b/public/schema/decimal_test.go @@ -453,6 +453,16 @@ func TestParseDecimal(t *testing.T) { {"trailing dot with scale", "1.", 3, "1000"}, {"negative integer", "-123", 0, "-123"}, {"max precision", "12345678901234567890123456789012345678", 0, "12345678901234567890123456789012345678"}, + // Lenient acceptance — non-canonical but unambiguous inputs are + // normalised on the way out via FormatDecimal. 
+ {"leading zero", "01", 0, "1"}, + {"leading zeros multiple", "001.5", 4, "15000"}, + {"leading zero negative", "-01.5", 4, "-15000"}, + {"leading plus", "+1", 0, "1"}, + {"leading plus fractional", "+1.5", 4, "15000"}, + {"missing integer part", ".5", 1, "5"}, + {"missing integer part with sign", "-.5", 1, "-5"}, + {"plus and missing integer", "+.5", 1, "5"}, } for _, tt := range tests { @@ -473,8 +483,8 @@ func TestParseDecimalErrors(t *testing.T) { }{ {"empty", "", 0, "must not be empty"}, {"just minus", "-", 0, "no digits"}, - {"leading plus", "+1", 0, "must not have a leading plus"}, - {"missing integer part", ".5", 1, "missing the integer part"}, + {"just plus", "+", 0, "no digits"}, + {"just dot", ".", 1, "no digits"}, {"two dots", "1.2.3", 1, "at most one decimal point"}, {"non-digit", "1.2a", 1, "non-digit"}, {"scientific notation", "1e5", 0, "non-digit"}, diff --git a/public/schema/decimal_types.md b/public/schema/decimal_types.md index c5775d362..c6dcffc4e 100644 --- a/public/schema/decimal_types.md +++ b/public/schema/decimal_types.md @@ -1,8 +1,10 @@ # Decimal types in `public/schema` -This document describes the `Decimal` common type and its parameterised -representation, and lays out the contracts that schema-format converters and -data-source plugins must honour when handling decimal values. +This document describes the two decimal common types — `Decimal` (fixed +precision and scale) and `BigDecimal` (arbitrary precision and scale, +recovered per-value) — and lays out the contracts that schema-format +converters and data-source plugins must honour when handling decimal +values. ## Why decimals are a special case @@ -113,6 +115,82 @@ there is no reliable way to recover precision and scale from a generic Go value without context. Decimal schemas must be constructed explicitly by data-source plugins from authoritative source metadata. 
+## BigDecimal: arbitrary-precision decimals + +Some real-world sources expose decimals where precision and/or scale are not +declared at the column level: + +- Postgres `numeric` declared without `(p, s)` (arbitrary precision). +- Oracle `NUMBER` with no `DATA_PRECISION` (Oracle's "floating decimal"). +- MongoDB `bson.Decimal128`, where precision and scale vary per value. + +`Decimal` cannot represent these honestly — there is nothing meaningful to put +in `precision` or `scale` at schema-discovery time. `BigDecimal` exists for +exactly this case. + +### Type and shape + +```go +const BigDecimal CommonType = 16 +``` + +`BigDecimal` stringifies as `"BIG_DECIMAL"`. It is a **leaf type** — no +children, no logical parameters. `Common.Validate()` rejects: + +- Any `Common` with `Type == BigDecimal` and `len(Children) > 0`. +- Any `Common` with `Logical.Decimal != nil` and `Type != Decimal`, which + covers `BigDecimal` having `Logical.Decimal` set. + +### Serialisation + +`ToAny` emits `{"type": "BIG_DECIMAL", "name": "...", "fingerprint": "..."}` +with no precision or scale fields. `ParseFromAny` accepts the type label and +rejects the precision/scale fields, since they are only valid on `Decimal`. + +The fingerprint canonical form includes the type identifier but no +`D:p:s|` segment. Existing fingerprints for non-decimal schemas remain +byte-stable. + +### Helpers + +```go +// NewBigDecimal constructs a leaf BigDecimal common schema. +func NewBigDecimal(name string, optional bool) Common + +// FormatBigDecimal renders an unscaled integer at the given scale as a +// canonical BigDecimal string. Output rules match FormatDecimal exactly. +func FormatBigDecimal(unscaled *big.Int, scale int32) (string, error) + +// ParseBigDecimal interprets a decimal-shaped string and returns the +// unscaled integer plus the scale recovered from the input. 
+func ParseBigDecimal(s string) (*big.Int, int32, error) +``` + +`ParseBigDecimal` differs from `ParseDecimal` in that the scale is recovered +from the input rather than supplied by the caller — this is the entire point +of `BigDecimal`. + +### Value contract + +The canonical-string contract for `BigDecimal` values mirrors `Decimal`'s +with two relaxations: + +- **No fixed scale.** Each value carries its natural scale. `"1"`, `"1.5"`, + and `"1.500"` are all valid `BigDecimal` values; the third explicitly + encodes a scale of 3. +- **No mandated trailing-zero padding.** Sources emit values at the source's + natural scale rather than padding to a fixed column scale. + +All other rules — no scientific notation, no leading zeros (other than a +single `"0"` before the decimal point), no leading `+`, optional leading `-`, +integer part required, no thousands separators or whitespace — are unchanged +from `Decimal`. + +### Inference + +Like `Decimal`, `BigDecimal` is not inferred by `InferFromAny`. Sources must +construct it explicitly from authoritative source metadata. + ## Contract for schema-format converters Converters live outside this package (Avro, Parquet, Iceberg, JSON Schema, @@ -133,6 +211,26 @@ When **producing** a `Common` schema from a format-native schema, the converter constructs `&LogicalParams{Decimal: &DecimalParams{...}}` from the source precision and scale and runs `Common.Validate()` before returning. +### BigDecimal handling + +When a converter encounters a `BigDecimal` common schema, the correct +behaviour depends on whether the target format has a native arbitrary-precision +decimal: + +- **Formats with bounded fixed-precision decimals** (Avro, Parquet, Iceberg). + Reject `BigDecimal` with a clear error pointing the user to coerce the + value upstream into a fixed-precision form. Silent truncation or guessing + a precision is forbidden — `BigDecimal` exists specifically because no + fixed precision was promised. 
+- **Formats without native decimals at all** (JSON Schema, Protobuf). + Emit a permissive string representation (e.g. JSON Schema + `{"type": "string", "pattern": "^-?(0|[1-9][0-9]*)(\\.[0-9]+)?$"}`). + +When **producing** a `Common` schema from a format-native schema, converters +that read formats with a native arbitrary-precision decimal type (one that +declares no fixed precision or scale) construct a `BigDecimal` directly via +`schema.NewBigDecimal`. + ### Avro Avro's `decimal` is a logical type built on top of `bytes` or `fixed`. @@ -185,9 +283,8 @@ cases. Sources reading from `NUMBER(p, s)` set `Precision = p` and `Scale = s`. The following conditions must be handled explicitly: -- `NUMBER` with **no** declared precision (Oracle's "floating decimal"): there - is no fixed precision to record. Sources must either pick a sentinel - precision (e.g. 38) and warn, or downgrade to `String`. +- `NUMBER` with **no** declared precision (Oracle's "floating decimal"): emit + `BigDecimal` via `schema.NewBigDecimal`. - `NUMBER` with declared precision but **no** scale: `Scale = 0`. - `NUMBER` with **negative** scale: not supported. Sources must either round to scale 0, downgrade to `String`, or refuse the column. @@ -198,17 +295,21 @@ These map directly: precision and scale from the column metadata translate straight to `DecimalParams`. Both databases enforce `0 ≤ scale ≤ precision`, so values from these sources will always validate. -`NUMERIC` columns with no precision (Postgres "arbitrary precision") fall into -the same bucket as undeclared Oracle `NUMBER`: pick a precision and warn, or -downgrade to `String`. +`NUMERIC` columns with no precision (Postgres "arbitrary precision") emit +`BigDecimal` via `schema.NewBigDecimal`. The same applies to any source +emitting a value type without column-level precision (e.g. MongoDB +`bson.Decimal128`). ### JSON Schema JSON Schema has no native decimal. 
Converters should map `Decimal` to `{"type": "string", "pattern": ...}` with a regex that matches the precision -and scale, and document the loss of arithmetic semantics in the -roundtripped schema. Inbound conversion (JSON Schema → common) cannot recover -`Decimal` and should retain the value as `String`. +and scale, and `BigDecimal` to a permissive +`{"type": "string", "pattern": "^-?(0|[1-9][0-9]*)(\\.[0-9]+)?$"}` that +accepts any canonical decimal string. Document the loss of arithmetic +semantics in the round-tripped schema. Inbound conversion (JSON Schema → +common) cannot recover either decimal type and should retain the value as +`String`. ## Contract for data-source plugins @@ -259,10 +360,12 @@ decimal value in **canonical string form**: - A leading minus sign for negative values; no leading plus sign. - No leading zeros except for the single `0` before a decimal point. -- A decimal point appears if and only if `scale > 0`. -- Exactly `scale` digits after the decimal point — sources must pad with - trailing zeros if necessary so that `"1.5"` for a `(p, 4)` column is - emitted as `"1.5000"`. +- A decimal point appears if and only if `scale > 0` (for `Decimal`) or if + the value has fractional digits (for `BigDecimal`). +- For `Decimal`: exactly `scale` digits after the decimal point — sources + must pad with trailing zeros so that `"1.5"` for a `(p, 4)` column is + emitted as `"1.5000"`. For `BigDecimal`: digits after the decimal point + reflect the value's natural scale, no padding required. - No scientific notation, thousands separators, or whitespace. Examples for `Precision=18, Scale=4`: @@ -273,6 +376,17 @@ Examples for `Precision=18, Scale=4`: | `-0.1` | `"-0.1000"` | | `0` | `"0.0000"` | +The contract above describes what emitters MUST produce. 
Parsers in this +package (`schema.ParseDecimal`, `schema.ParseBigDecimal`) deliberately +follow Postel's principle and are **lenient** about non-canonical-but- +unambiguous inputs — leading plus signs, leading zeros, and inputs missing +the integer part (e.g. `".5"`) are accepted and normalised. Parsers stay +strict about ambiguous or malformed inputs (scientific notation, multiple +decimal points, whitespace, thousands separators, non-digit characters). +Canonical form is always re-asserted on the way out: any value parsed +through `Parse*` and re-emitted through `Format*` lands in canonical form +regardless of the input shape. + Strings are chosen as the canonical form because they: - Survive JSON round-trips without floating-point loss. diff --git a/public/schema/fingerprint_test.go b/public/schema/fingerprint_test.go index 664c52408..72a849da1 100644 --- a/public/schema/fingerprint_test.go +++ b/public/schema/fingerprint_test.go @@ -273,6 +273,7 @@ func TestFingerprintAllTypes(t *testing.T) { {Type: Timestamp, Name: "test"}, {Type: Any, Name: "test"}, {Type: Decimal, Name: "test", Logical: &LogicalParams{Decimal: &DecimalParams{Precision: 10, Scale: 2}}}, + {Type: BigDecimal, Name: "test"}, } fingerprints := make(map[string]CommonType) diff --git a/public/schema/infer_from_any.go b/public/schema/infer_from_any.go index 2abfbdee3..6c618a7c5 100644 --- a/public/schema/infer_from_any.go +++ b/public/schema/infer_from_any.go @@ -88,8 +88,9 @@ func inferFromAny(name string, v any) (Common, error) { // [encoding/json.Number] values are inferred as Int64 when they parse as an // integer and as Float64 otherwise. // -// Parameterised logical types (e.g. Decimal) cannot be inferred from generic Go -// values and must be constructed explicitly. +// Decimal types (both [Decimal] and [BigDecimal]) cannot be inferred from +// generic Go values and must be constructed explicitly via [NewDecimal] or +// [NewBigDecimal]. // // All values will be recorded as non-optional. 
func InferFromAny(v any) (Common, error) {