Skip to content

Commit c318231

Browse files
committed
feat: Enhanced JSONB Parser with Decimal Support and Extended Syntax
1 parent dcaf261 commit c318231

18 files changed

Lines changed: 3591 additions & 194 deletions

File tree

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ ordered-float = { version = "5.0", default-features = false }
3737
rand = { version = "0.9.0", features = ["small_rng"] }
3838
ryu = "1.0"
3939
serde = "1.0"
40-
serde_json = { version = "1.0", default-features = false, features = ["std"] }
40+
serde_json = { version = "1.0", default-features = false, features = ["std", "arbitrary_precision"] }
4141

4242
[dev-dependencies]
4343
goldenfile = "1.8"
@@ -46,6 +46,7 @@ json-deserializer = "0.4.4"
4646
simd-json = "0.15.0"
4747
mockalloc = "0.1.2"
4848
criterion = "0.5.1"
49+
proptest = "1.7"
4950

5051
[features]
5152
default = ["databend", "serde_json/preserve_order"]

src/constants.rs

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,16 @@ pub(crate) const TRUE_LEVEL: u8 = 3;
3535
pub(crate) const FALSE_LEVEL: u8 = 2;
3636
pub(crate) const EXTENSION_LEVEL: u8 = 1;
3737

38-
pub(crate) const TYPE_STRING: &str = "string";
39-
pub(crate) const TYPE_NULL: &str = "null";
40-
pub(crate) const TYPE_BOOLEAN: &str = "boolean";
41-
pub(crate) const TYPE_NUMBER: &str = "number";
42-
pub(crate) const TYPE_ARRAY: &str = "array";
43-
pub(crate) const TYPE_OBJECT: &str = "object";
44-
pub(crate) const TYPE_DECIMAL: &str = "decimal";
45-
pub(crate) const TYPE_BINARY: &str = "binary";
46-
pub(crate) const TYPE_DATE: &str = "date";
47-
pub(crate) const TYPE_TIMESTAMP: &str = "timestamp";
48-
pub(crate) const TYPE_TIMESTAMP_TZ: &str = "timestamp_tz";
49-
pub(crate) const TYPE_INTERVAL: &str = "interval";
38+
pub(crate) const TYPE_STRING: &str = "STRING";
39+
pub(crate) const TYPE_NULL: &str = "NULL_VALUE";
40+
pub(crate) const TYPE_BOOLEAN: &str = "BOOLEAN";
41+
pub(crate) const TYPE_INTEGER: &str = "INTEGER";
42+
pub(crate) const TYPE_ARRAY: &str = "ARRAY";
43+
pub(crate) const TYPE_OBJECT: &str = "OBJECT";
44+
pub(crate) const TYPE_DECIMAL: &str = "DECIMAL";
45+
pub(crate) const TYPE_DOUBLE: &str = "DOUBLE";
46+
pub(crate) const TYPE_BINARY: &str = "BINARY";
47+
pub(crate) const TYPE_DATE: &str = "DATE";
48+
pub(crate) const TYPE_TIMESTAMP: &str = "TIMESTAMP";
49+
pub(crate) const TYPE_TIMESTAMP_TZ: &str = "TIMESTAMP_TZ";
50+
pub(crate) const TYPE_INTERVAL: &str = "INTERVAL";

src/core/databend/de.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,9 +213,10 @@ impl<'de> Deserializer<'de> {
213213
match num {
214214
Number::Int64(n) => T::from_i64(n).ok_or(Error::UnexpectedType),
215215
Number::UInt64(n) => T::from_u64(n).ok_or(Error::UnexpectedType),
216-
Number::Float64(_) | Number::Decimal128(_) | Number::Decimal256(_) => {
217-
Err(Error::UnexpectedType)
218-
}
216+
Number::Float64(_)
217+
| Number::Decimal64(_)
218+
| Number::Decimal128(_)
219+
| Number::Decimal256(_) => Err(Error::UnexpectedType),
219220
}
220221
}
221222

@@ -228,6 +229,10 @@ impl<'de> Deserializer<'de> {
228229
Number::Int64(n) => T::from_i64(n).ok_or(Error::UnexpectedType),
229230
Number::UInt64(n) => T::from_u64(n).ok_or(Error::UnexpectedType),
230231
Number::Float64(n) => T::from_f64(n).ok_or(Error::UnexpectedType),
232+
Number::Decimal64(v) => {
233+
let n = v.to_float64();
234+
T::from_f64(n).ok_or(Error::UnexpectedType)
235+
}
231236
Number::Decimal128(v) => {
232237
let n = v.to_float64();
233238
T::from_f64(n).ok_or(Error::UnexpectedType)
@@ -317,6 +322,10 @@ impl<'de> Deserializer<'de> {
317322
}
318323
}
319324
Number::Float64(i) => visitor.visit_f64(i),
325+
Number::Decimal64(i) => {
326+
let v = i.to_float64();
327+
visitor.visit_f64(v)
328+
}
320329
Number::Decimal128(i) => {
321330
let v = i.to_float64();
322331
visitor.visit_f64(v)

src/core/databend/ser.rs

Lines changed: 3 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -487,20 +487,7 @@ impl Serialize for RawJsonb<'_> {
487487
NUMBER_TAG => {
488488
let num = Number::decode(&self.data[payload_start..payload_end])
489489
.map_err(|e| ser::Error::custom(format!("{e}")))?;
490-
491-
match num {
492-
Number::Int64(i) => serializer.serialize_i64(i),
493-
Number::UInt64(i) => serializer.serialize_u64(i),
494-
Number::Float64(i) => serializer.serialize_f64(i),
495-
Number::Decimal128(i) => {
496-
let v = i.to_float64();
497-
serializer.serialize_f64(v)
498-
}
499-
Number::Decimal256(i) => {
500-
let v = i.to_float64();
501-
serializer.serialize_f64(v)
502-
}
503-
}
490+
num.serialize(serializer)
504491
}
505492
STRING_TAG => {
506493
let s = unsafe {
@@ -539,19 +526,7 @@ impl Serialize for RawJsonb<'_> {
539526
NUMBER_TAG => {
540527
let num = Number::decode(&self.data[payload_start..payload_end])
541528
.map_err(|e| ser::Error::custom(format!("{e}")))?;
542-
match num {
543-
Number::Int64(i) => serialize_seq.serialize_element(&i)?,
544-
Number::UInt64(i) => serialize_seq.serialize_element(&i)?,
545-
Number::Float64(i) => serialize_seq.serialize_element(&i)?,
546-
Number::Decimal128(i) => {
547-
let v = i.to_float64();
548-
serialize_seq.serialize_element(&v)?
549-
}
550-
Number::Decimal256(i) => {
551-
let v = i.to_float64();
552-
serialize_seq.serialize_element(&v)?
553-
}
554-
}
529+
serialize_seq.serialize_element(&num)?;
555530
}
556531
STRING_TAG => {
557532
let s = unsafe {
@@ -624,19 +599,7 @@ impl Serialize for RawJsonb<'_> {
624599
NUMBER_TAG => {
625600
let num = Number::decode(&self.data[payload_start..payload_end])
626601
.map_err(|e| ser::Error::custom(format!("{e}")))?;
627-
match num {
628-
Number::Int64(i) => serialize_map.serialize_entry(&k, &i)?,
629-
Number::UInt64(i) => serialize_map.serialize_entry(&k, &i)?,
630-
Number::Float64(i) => serialize_map.serialize_entry(&k, &i)?,
631-
Number::Decimal128(i) => {
632-
let v = i.to_float64();
633-
serialize_map.serialize_entry(&k, &v)?
634-
}
635-
Number::Decimal256(i) => {
636-
let v = i.to_float64();
637-
serialize_map.serialize_entry(&k, &v)?
638-
}
639-
}
602+
serialize_map.serialize_entry(&k, &num)?;
640603
}
641604
STRING_TAG => {
642605
let s = unsafe {

src/core/databend/util.rs

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ use crate::extension::Timestamp;
3232
use crate::extension::TimestampTz;
3333
use crate::number::Decimal128;
3434
use crate::number::Decimal256;
35+
use crate::number::Decimal64;
3536
use crate::Number;
3637
use crate::OwnedJsonb;
3738
use crate::RawJsonb;
@@ -304,19 +305,23 @@ impl Number {
304305
writer.write_all(&v.to_be_bytes())?;
305306
Ok(9)
306307
}
308+
Self::Decimal64(v) => {
309+
writer.write_all(&[NUMBER_DECIMAL])?;
310+
writer.write_all(&v.value.to_be_bytes())?;
311+
writer.write_all(&v.scale.to_be_bytes())?;
312+
Ok(10)
313+
}
307314
Self::Decimal128(v) => {
308315
writer.write_all(&[NUMBER_DECIMAL])?;
309316
writer.write_all(&v.value.to_be_bytes())?;
310-
writer.write_all(&v.precision.to_be_bytes())?;
311317
writer.write_all(&v.scale.to_be_bytes())?;
312-
Ok(19)
318+
Ok(18)
313319
}
314320
Self::Decimal256(v) => {
315321
writer.write_all(&[NUMBER_DECIMAL])?;
316322
writer.write_all(&v.value.to_be_bytes())?;
317-
writer.write_all(&v.precision.to_be_bytes())?;
318323
writer.write_all(&v.scale.to_be_bytes())?;
319-
Ok(35)
324+
Ok(34)
320325
}
321326
}
322327
}
@@ -353,26 +358,36 @@ impl Number {
353358
},
354359
NUMBER_FLOAT => Number::Float64(f64::from_be_bytes(bytes[1..].try_into().unwrap())),
355360
NUMBER_DECIMAL => match len {
361+
9 => {
362+
let value = i64::from_be_bytes(bytes[1..9].try_into().unwrap());
363+
let scale = u8::from_be_bytes(bytes[9..10].try_into().unwrap());
364+
let dec = Decimal64 { scale, value };
365+
Number::Decimal64(dec)
366+
}
367+
17 => {
368+
let value = i128::from_be_bytes(bytes[1..17].try_into().unwrap());
369+
let scale = u8::from_be_bytes(bytes[17..18].try_into().unwrap());
370+
let dec = Decimal128 { scale, value };
371+
Number::Decimal128(dec)
372+
}
356373
18 => {
374+
// Backward compatibility: the deprecated Decimal128 layout stored a precision byte (index 17) before the scale; it is skipped here.
357375
let value = i128::from_be_bytes(bytes[1..17].try_into().unwrap());
358-
let precision = u8::from_be_bytes(bytes[17..18].try_into().unwrap());
359376
let scale = u8::from_be_bytes(bytes[18..19].try_into().unwrap());
360-
let dec = Decimal128 {
361-
precision,
362-
scale,
363-
value,
364-
};
377+
let dec = Decimal128 { scale, value };
365378
Number::Decimal128(dec)
366379
}
380+
33 => {
381+
let value = i256::from_be_bytes(bytes[1..33].try_into().unwrap());
382+
let scale = u8::from_be_bytes(bytes[33..34].try_into().unwrap());
383+
let dec = Decimal256 { scale, value };
384+
Number::Decimal256(dec)
385+
}
367386
34 => {
387+
// Backward compatibility: the deprecated Decimal256 layout stored a precision byte (index 33) before the scale; it is skipped here.
368388
let value = i256::from_be_bytes(bytes[1..33].try_into().unwrap());
369-
let precision = u8::from_be_bytes(bytes[33..34].try_into().unwrap());
370389
let scale = u8::from_be_bytes(bytes[34..35].try_into().unwrap());
371-
let dec = Decimal256 {
372-
precision,
373-
scale,
374-
value,
375-
};
390+
let dec = Decimal256 { scale, value };
376391
Number::Decimal256(dec)
377392
}
378393
_ => {

src/extension.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,35 +29,76 @@ const MONTHS_PER_YEAR: i32 = 12;
2929

3030
const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S%.6f";
3131

32+
/// Represents extended JSON value types that are not supported in standard JSON.
33+
///
34+
/// Standard JSON only supports strings, numbers, booleans, null, arrays, and objects.
35+
/// This enum provides additional data types commonly needed in database systems and
36+
/// other applications that require more specialized data representations.
3237
#[derive(Debug, Clone)]
3338
pub enum ExtensionValue<'a> {
39+
/// Binary data (byte array), allowing efficient storage of binary content
40+
/// that would otherwise require base64 encoding in standard JSON
3441
Binary(&'a [u8]),
42+
/// Calendar date without time component (year, month, day)
3543
Date(Date),
44+
/// Timestamp with microsecond precision but without timezone information
3645
Timestamp(Timestamp),
46+
/// Timestamp with microsecond precision and timezone offset information
3747
TimestampTz(TimestampTz),
48+
/// Time interval representation for duration calculations
3849
Interval(Interval),
3950
}
4051

52+
/// Represents a calendar date (year, month, day) without time component.
53+
///
54+
/// The value is stored as days since the Unix epoch (January 1, 1970).
55+
/// This allows for efficient date arithmetic and comparison operations.
56+
/// Standard JSON has no native date type and typically uses ISO 8601 strings.
4157
#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)]
4258
pub struct Date {
59+
/// Days since Unix epoch (January 1, 1970).
60+
/// Positive values represent dates after the epoch; negative values represent dates before it.
4361
pub value: i32,
4462
}
4563

64+
/// Represents a timestamp (date and time) without timezone information.
65+
///
66+
/// The value is stored as microseconds since the Unix epoch (January 1, 1970 00:00:00 UTC).
67+
/// This provides microsecond precision for timestamp operations.
68+
/// Standard JSON has no native timestamp type and typically uses ISO 8601 strings.
4669
#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)]
4770
pub struct Timestamp {
71+
/// Microseconds since Unix epoch (January 1, 1970 00:00:00 UTC)
4872
pub value: i64,
4973
}
5074

75+
/// Represents a timestamp with timezone information.
76+
///
77+
/// Combines a timestamp value with a timezone offset, allowing for
78+
/// timezone-aware datetime operations. The timestamp is stored in UTC,
79+
/// and the offset indicates the local timezone.
80+
/// Standard JSON has no native timezone-aware timestamp type.
5181
#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)]
5282
pub struct TimestampTz {
83+
/// Timezone offset in hours from UTC
5384
pub offset: i8,
85+
/// Microseconds since Unix epoch (January 1, 1970 00:00:00 UTC)
5486
pub value: i64,
5587
}
5688

89+
/// Represents a time interval or duration.
90+
///
91+
/// This structure can represent complex time intervals with separate
92+
/// components for months, days, and microseconds, allowing for precise
93+
/// duration calculations that account for calendar irregularities.
94+
/// Standard JSON has no native interval/duration type.
5795
#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)]
5896
pub struct Interval {
97+
/// Number of months in the interval
5998
pub months: i32,
99+
/// Number of days in the interval
60100
pub days: i32,
101+
/// Number of microseconds in the interval
61102
pub micros: i64,
62103
}
63104

src/from.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,9 @@ impl<'a> From<Value<'a>> for JsonValue {
197197
Number::Int64(v) => JsonValue::Number(v.into()),
198198
Number::UInt64(v) => JsonValue::Number(v.into()),
199199
Number::Float64(v) => JsonValue::Number(JsonNumber::from_f64(v).unwrap()),
200+
Number::Decimal64(v) => {
201+
JsonValue::Number(JsonNumber::from_f64(v.to_float64()).unwrap())
202+
}
200203
Number::Decimal128(v) => {
201204
JsonValue::Number(JsonNumber::from_f64(v.to_float64()).unwrap())
202205
}

src/functions/operator.rs

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,27 +66,27 @@ impl RawJsonb<'_> {
6666
/// // Type checking
6767
/// let arr_jsonb = "[1, 2, 3]".parse::<OwnedJsonb>().unwrap();
6868
/// let raw_jsonb = arr_jsonb.as_raw();
69-
/// assert_eq!(raw_jsonb.type_of().unwrap(), "array");
69+
/// assert_eq!(raw_jsonb.type_of().unwrap(), "ARRAY");
7070
///
7171
/// let obj_jsonb = r#"{"a": 1}"#.parse::<OwnedJsonb>().unwrap();
7272
/// let raw_jsonb = obj_jsonb.as_raw();
73-
/// assert_eq!(raw_jsonb.type_of().unwrap(), "object");
73+
/// assert_eq!(raw_jsonb.type_of().unwrap(), "OBJECT");
7474
///
7575
/// let num_jsonb = "1".parse::<OwnedJsonb>().unwrap();
7676
/// let raw_jsonb = num_jsonb.as_raw();
77-
/// assert_eq!(raw_jsonb.type_of().unwrap(), "number");
77+
/// assert_eq!(raw_jsonb.type_of().unwrap(), "INTEGER");
7878
///
7979
/// let string_jsonb = r#""hello""#.parse::<OwnedJsonb>().unwrap();
8080
/// let raw_jsonb = string_jsonb.as_raw();
81-
/// assert_eq!(raw_jsonb.type_of().unwrap(), "string");
81+
/// assert_eq!(raw_jsonb.type_of().unwrap(), "STRING");
8282
///
8383
/// let bool_jsonb = "true".parse::<OwnedJsonb>().unwrap();
8484
/// let raw_jsonb = bool_jsonb.as_raw();
85-
/// assert_eq!(raw_jsonb.type_of().unwrap(), "boolean");
85+
/// assert_eq!(raw_jsonb.type_of().unwrap(), "BOOLEAN");
8686
///
8787
/// let null_jsonb = "null".parse::<OwnedJsonb>().unwrap();
8888
/// let raw_jsonb = null_jsonb.as_raw();
89-
/// assert_eq!(raw_jsonb.type_of().unwrap(), "null");
89+
/// assert_eq!(raw_jsonb.type_of().unwrap(), "NULL_VALUE");
9090
/// ```
9191
pub fn type_of(&self) -> Result<&'static str> {
9292
let jsonb_item_type = self.jsonb_item_type()?;
@@ -99,9 +99,11 @@ impl RawJsonb<'_> {
9999
JsonbItem::Number(data) => {
100100
let val = Number::decode(data)?;
101101
match val {
102-
Number::Decimal128(_v) => Ok(TYPE_DECIMAL),
103-
Number::Decimal256(_v) => Ok(TYPE_DECIMAL),
104-
_ => Ok(TYPE_NUMBER),
102+
Number::UInt64(_) | Number::Int64(_) => Ok(TYPE_INTEGER),
103+
Number::Decimal64(_)
104+
| Number::Decimal128(_)
105+
| Number::Decimal256(_) => Ok(TYPE_DECIMAL),
106+
Number::Float64(_) => Ok(TYPE_DOUBLE),
105107
}
106108
}
107109
_ => Err(Error::InvalidJsonb),

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ pub use extension::*;
8484
pub use from::*;
8585
pub use number::Decimal128;
8686
pub use number::Decimal256;
87+
pub use number::Decimal64;
8788
pub use number::Number;
8889
pub use owned::to_owned_jsonb;
8990
pub use owned::OwnedJsonb;

0 commit comments

Comments
 (0)