From 8d82534b70c463ac4f42fb2799d41bd6559c174e Mon Sep 17 00:00:00 2001 From: baishen Date: Mon, 30 Jun 2025 16:49:01 +0800 Subject: [PATCH 1/3] perf: Improve parse json performance --- .github/workflows/rust.yml | 2 + benches/parser.rs | 10 +- benches/strip_nulls.rs | 4 +- src/core/databend/ser.rs | 299 ++++++++++++---- src/error.rs | 6 +- src/lib.rs | 5 + src/number.rs | 384 ++++++++++++++++---- src/parser.rs | 680 +++++++++++++++++++++++++++++------- tests/it/encode.rs | 4 - tests/it/functions.rs | 26 +- tests/it/jsonpath_parser.rs | 6 +- tests/it/keypath_parser.rs | 6 +- tests/it/number.rs | 76 +++- tests/it/parser.rs | 126 +++++-- 14 files changed, 1289 insertions(+), 345 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 5d09805..817853a 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -24,3 +24,5 @@ jobs: run: cargo build --verbose - name: Run tests run: cargo test --verbose + - name: Run tests disable arbitrary_precision + run: cargo test --no-default-features --features databend --verbose diff --git a/benches/parser.rs b/benches/parser.rs index 46fe81a..6f9a010 100644 --- a/benches/parser.rs +++ b/benches/parser.rs @@ -18,7 +18,7 @@ use std::io::Read; use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; fn parse_jsonb(data: &[u8]) { - let _v: jsonb::Value = jsonb::parse_value(data).unwrap(); + let _v: jsonb::OwnedJsonb = jsonb::parse_owned_jsonb(data).unwrap(); } fn parse_serde_json(data: &[u8]) { @@ -46,20 +46,20 @@ fn add_benchmark(c: &mut Criterion) { let file = format!("{}", path.unwrap().path().display()); let bytes = read(&file); - c.bench_function(&format!("jsonb parse {}", file), |b| { + c.bench_function(&format!("jsonb parse {file}"), |b| { b.iter(|| parse_jsonb(&bytes)) }); - c.bench_function(&format!("serde_json parse {}", file), |b| { + c.bench_function(&format!("serde_json parse {file}"), |b| { b.iter(|| parse_serde_json(&bytes)) }); - 
c.bench_function(&format!("json_deserializer parse {}", file), |b| { + c.bench_function(&format!("json_deserializer parse {file}"), |b| { b.iter(|| parse_json_deserializer(&bytes)) }); let bytes = bytes.clone(); - c.bench_function(&format!("simd_json parse {}", file), move |b| { + c.bench_function(&format!("simd_json parse {file}"), move |b| { b.iter_batched( || bytes.clone(), |mut data| parse_simd_json(&mut data), diff --git a/benches/strip_nulls.rs b/benches/strip_nulls.rs index 3023a9d..f4e71ff 100644 --- a/benches/strip_nulls.rs +++ b/benches/strip_nulls.rs @@ -62,11 +62,11 @@ fn add_benchmark(c: &mut Criterion) { let bytes = read(&file); let json = from_slice(&bytes).unwrap().to_vec(); - c.bench_function(&format!("strip_nulls_deser[{}]", file), |b| { + c.bench_function(&format!("strip_nulls_deser[{file}]"), |b| { b.iter(|| strip_nulls_deser(&json)); }); - c.bench_function(&format!("strip_nulls_fast[{}]", file), |b| { + c.bench_function(&format!("strip_nulls_fast[{file}]"), |b| { b.iter(|| strip_nulls_fast(&json)); }); } diff --git a/src/core/databend/ser.rs b/src/core/databend/ser.rs index 5470c55..a2b3746 100644 --- a/src/core/databend/ser.rs +++ b/src/core/databend/ser.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::borrow::Cow; use std::collections::VecDeque; use byteorder::BigEndian; @@ -28,6 +29,7 @@ use crate::core::ObjectBuilder; use crate::error::*; use crate::extension::ExtensionValue; use crate::number::Number; +use crate::parser::JsonAst; use crate::value::Object; use crate::value::Value; use crate::Error; @@ -634,16 +636,92 @@ impl Serialize for RawJsonb<'_> { } } +/// `BaseEncoder` provides common buffer management functionality for `JSONB` encoding. +/// It handles low-level operations like reserving space for JEntries, writing encoded +/// JEntries back to the buffer, and encoding container headers. 
+struct BaseEncoder<'a> { + buf: &'a mut Vec, +} + +impl<'a> BaseEncoder<'a> { + /// Creates a new `BaseEncoder` with the given buffer. + fn new(buf: &'a mut Vec) -> BaseEncoder<'a> { + Self { buf } + } + + /// Reserves space in the buffer for JEntries that will be filled in later. + /// Returns the starting index where the JEntries will be placed. + fn reserve_jentries(&mut self, len: usize) -> usize { + let old_len = self.buf.len(); + let new_len = old_len + len; + self.buf.resize(new_len, 0); + old_len + } + + /// Writes an encoded `JEntry` to the buffer at the specified index. + /// Updates the index to point to the next `JEntry` position. + fn replace_jentry(&mut self, jentry: JEntry, jentry_index: &mut usize) { + let jentry_bytes = jentry.encoded().to_be_bytes(); + for (i, b) in jentry_bytes.iter().enumerate() { + self.buf[*jentry_index + i] = *b; + } + *jentry_index += 4; + } + + /// Encodes a scalar container header and reserves space for its `JEntry`. + /// Returns the total length of the scalar container and the index where its JEntry will be placed. + fn encode_scalar_header(&mut self) -> (usize, usize) { + let header = SCALAR_CONTAINER_TAG; + self.buf.write_u32::(header).unwrap(); + + // Scalar Value only has one JEntry + let scalar_len = 4 + 4; + let jentry_index = self.reserve_jentries(4); + (scalar_len, jentry_index) + } + + /// Encodes an array container header and reserves space for its JEntries. + /// Returns the total length of the array container and the index where its JEntries will be placed. + fn encode_array_header(&mut self, len: usize) -> (usize, usize) { + let header = ARRAY_CONTAINER_TAG | len as u32; + self.buf.write_u32::(header).unwrap(); + + // `Array` has N `JEntries` + let array_len = 4 + len * 4; + let jentry_index = self.reserve_jentries(len * 4); + + (array_len, jentry_index) + } + + /// Encodes an object container header and reserves space for its JEntries. 
+ /// Returns the total length of the object container and the index where its JEntries will be placed. + fn encode_object_header(&mut self, len: usize) -> (usize, usize) { + let header = OBJECT_CONTAINER_TAG | len as u32; + self.buf.write_u32::(header).unwrap(); + + // `Object` has 2 * N `JEntries` + let object_len = 4 + len * 8; + let jentry_index = self.reserve_jentries(len * 8); + + (object_len, jentry_index) + } +} + +/// Encoder for serializing Value types to `JSONB` binary format. +/// Uses `BaseEncoder` for common buffer management operations. pub(crate) struct Encoder<'a> { - pub buf: &'a mut Vec, + base_encoder: BaseEncoder<'a>, } impl<'a> Encoder<'a> { + /// Creates a new `Encoder` with the given buffer. pub(crate) fn new(buf: &'a mut Vec) -> Encoder<'a> { - Self { buf } + let base_encoder = BaseEncoder::new(buf); + Self { base_encoder } } - // Encode `JSONB` Value to a sequence of bytes + /// Encodes a `Value` into `JSONB` binary format. + /// Dispatches to the appropriate encoding method based on the value type. pub(crate) fn encode(&mut self, value: &Value<'a>) { match value { Value::Array(array) => self.encode_array(array), @@ -652,93 +730,61 @@ impl<'a> Encoder<'a> { }; } - // Encoded `Scalar` consists of a `Header`, a `JEntry` and encoded data + /// Encodes a scalar `Value` (null, bool, number, string, or extension types). + /// Returns the total length of the encoded scalar. 
fn encode_scalar(&mut self, value: &Value<'a>) -> usize { - self.buf - .write_u32::(SCALAR_CONTAINER_TAG) - .unwrap(); - - // Scalar Value only has one JEntry - let mut scalar_len = 4 + 4; - let mut jentry_index = self.reserve_jentries(4); + let (mut scalar_len, mut jentry_index) = self.base_encoder.encode_scalar_header(); let jentry = self.encode_value(value); scalar_len += jentry.length as usize; - self.replace_jentry(jentry, &mut jentry_index); + self.base_encoder.replace_jentry(jentry, &mut jentry_index); scalar_len } - // Encoded `Array` consists of a `Header`, N `JEntries` and encoded data - // N is the number of `Array` inner values + /// Encodes an array of Values. + /// Returns the total length of the encoded array. fn encode_array(&mut self, values: &[Value<'a>]) -> usize { - let header = ARRAY_CONTAINER_TAG | values.len() as u32; - self.buf.write_u32::(header).unwrap(); - - // `Array` has N `JEntries` - let mut array_len = 4 + values.len() * 4; - let mut jentry_index = self.reserve_jentries(values.len() * 4); + let (mut array_len, mut jentry_index) = self.base_encoder.encode_array_header(values.len()); // encode all values for value in values.iter() { let jentry = self.encode_value(value); array_len += jentry.length as usize; - self.replace_jentry(jentry, &mut jentry_index); + self.base_encoder.replace_jentry(jentry, &mut jentry_index); } array_len } - // Encoded `Object` consists of a `Header`, 2 * N `JEntries` and encoded data - // N is the number of `Object` inner key value pair + /// Encodes an object of Values (map of string keys to Values). + /// Returns the total length of the encoded object. 
fn encode_object(&mut self, obj: &Object<'a>) -> usize { - let header = OBJECT_CONTAINER_TAG | obj.len() as u32; - self.buf.write_u32::(header).unwrap(); - - // `Object` has 2 * N `JEntries` - let mut object_len = 4 + obj.len() * 8; - let mut jentry_index = self.reserve_jentries(obj.len() * 8); + let (mut object_len, mut jentry_index) = self.base_encoder.encode_object_header(obj.len()); // encode all keys first for (key, _) in obj.iter() { let len = key.len(); object_len += len; - self.buf.extend_from_slice(key.as_bytes()); + self.base_encoder.buf.extend_from_slice(key.as_bytes()); let jentry = JEntry::make_string_jentry(len); - self.replace_jentry(jentry, &mut jentry_index); + self.base_encoder.replace_jentry(jentry, &mut jentry_index); } + // encode all values for (_, value) in obj.iter() { let jentry = self.encode_value(value); object_len += jentry.length as usize; - self.replace_jentry(jentry, &mut jentry_index); + self.base_encoder.replace_jentry(jentry, &mut jentry_index); } object_len } - // Reserve space for `JEntries` and fill them later - // As the length of each `Value` cannot be known until the `Value` encoded - fn reserve_jentries(&mut self, len: usize) -> usize { - let old_len = self.buf.len(); - let new_len = old_len + len; - self.buf.resize(new_len, 0); - old_len - } - - // Write encoded `JEntry` to the corresponding index - fn replace_jentry(&mut self, jentry: JEntry, jentry_index: &mut usize) { - let jentry_bytes = jentry.encoded().to_be_bytes(); - for (i, b) in jentry_bytes.iter().enumerate() { - self.buf[*jentry_index + i] = *b; - } - *jentry_index += 4; - } - - // `Null` and `Boolean` only has a `JEntry` - // `Number` and `String` has a `JEntry` and an encoded data - // `Array` and `Object` has a container `JEntry` and nested encoded data + /// Encodes a single `Value` and returns its `JEntry`. + /// The `JEntry` contains metadata about the encoded value. 
fn encode_value(&mut self, value: &Value<'a>) -> JEntry { + let old_off = self.base_encoder.buf.len(); let jentry = match value { Value::Null => JEntry::make_null_jentry(), Value::Bool(v) => { @@ -749,49 +795,45 @@ impl<'a> Encoder<'a> { } } Value::Number(v) => { - let old_off = self.buf.len(); - let _ = v.compact_encode(&mut self.buf).unwrap(); - let len = self.buf.len() - old_off; + let _ = v.compact_encode(&mut self.base_encoder.buf).unwrap(); + let len = self.base_encoder.buf.len() - old_off; JEntry::make_number_jentry(len) } Value::String(s) => { let len = s.len(); - self.buf.extend_from_slice(s.as_ref().as_bytes()); + self.base_encoder + .buf + .extend_from_slice(s.as_ref().as_bytes()); JEntry::make_string_jentry(len) } Value::Binary(v) => { - let old_off = self.buf.len(); let val = ExtensionValue::Binary(v); - let _ = val.compact_encode(&mut self.buf).unwrap(); - let len = self.buf.len() - old_off; + let _ = val.compact_encode(&mut self.base_encoder.buf).unwrap(); + let len = self.base_encoder.buf.len() - old_off; JEntry::make_extension_jentry(len) } Value::Date(v) => { - let old_off = self.buf.len(); let val = ExtensionValue::Date(v.clone()); - let _ = val.compact_encode(&mut self.buf).unwrap(); - let len = self.buf.len() - old_off; + let _ = val.compact_encode(&mut self.base_encoder.buf).unwrap(); + let len = self.base_encoder.buf.len() - old_off; JEntry::make_extension_jentry(len) } Value::Timestamp(v) => { - let old_off = self.buf.len(); let val = ExtensionValue::Timestamp(v.clone()); - let _ = val.compact_encode(&mut self.buf).unwrap(); - let len = self.buf.len() - old_off; + let _ = val.compact_encode(&mut self.base_encoder.buf).unwrap(); + let len = self.base_encoder.buf.len() - old_off; JEntry::make_extension_jentry(len) } Value::TimestampTz(v) => { - let old_off = self.buf.len(); let val = ExtensionValue::TimestampTz(v.clone()); - let _ = val.compact_encode(&mut self.buf).unwrap(); - let len = self.buf.len() - old_off; + let _ = 
val.compact_encode(&mut self.base_encoder.buf).unwrap(); + let len = self.base_encoder.buf.len() - old_off; JEntry::make_extension_jentry(len) } Value::Interval(v) => { - let old_off = self.buf.len(); let val = ExtensionValue::Interval(v.clone()); - let _ = val.compact_encode(&mut self.buf).unwrap(); - let len = self.buf.len() - old_off; + let _ = val.compact_encode(&mut self.base_encoder.buf).unwrap(); + let len = self.base_encoder.buf.len() - old_off; JEntry::make_extension_jentry(len) } Value::Array(array) => { @@ -807,3 +849,116 @@ impl<'a> Encoder<'a> { jentry } } + +/// `JsonAstEncoder` for serializing JsonAst types to `JSONB` binary format. +/// Similar to `Encoder` but works with `JsonAst` instead of `Value` types. +/// Uses `BaseEncoder` for common buffer management operations. +pub(crate) struct JsonAstEncoder<'a> { + base_encoder: BaseEncoder<'a>, +} + +impl<'a> JsonAstEncoder<'a> { + /// Creates a new `JsonAstEncoder` with the given buffer. + pub(crate) fn new(buf: &'a mut Vec) -> JsonAstEncoder<'a> { + let base_encoder = BaseEncoder::new(buf); + Self { base_encoder } + } + + /// Encodes a `JsonAst` into `JSONB` binary format. + /// Dispatches to the appropriate encoding method based on the value type. + pub(crate) fn encode(&mut self, value: &JsonAst<'a>) { + match value { + JsonAst::Array(array) => self.encode_array(array), + JsonAst::Object(obj) => self.encode_object(obj), + _ => self.encode_scalar(value), + }; + } + + /// Encodes a scalar JsonAst (null, bool, number, or string). + /// Returns the total length of the encoded scalar. + fn encode_scalar(&mut self, value: &JsonAst<'a>) -> usize { + let (mut scalar_len, mut jentry_index) = self.base_encoder.encode_scalar_header(); + + let jentry = self.encode_value(value); + scalar_len += jentry.length as usize; + self.base_encoder.replace_jentry(jentry, &mut jentry_index); + + scalar_len + } + + /// Encodes an array of `JsonAst` values. + /// Returns the total length of the encoded array. 
+ fn encode_array(&mut self, values: &[JsonAst<'a>]) -> usize { + let (mut array_len, mut jentry_index) = self.base_encoder.encode_array_header(values.len()); + + // encode all values + for value in values.iter() { + let jentry = self.encode_value(value); + array_len += jentry.length as usize; + self.base_encoder.replace_jentry(jentry, &mut jentry_index); + } + + array_len + } + + /// Encodes an object of `JsonAst` values (vector of key-value pairs). + /// Returns the total length of the encoded object. + fn encode_object(&mut self, obj: &[(Cow<'a, str>, JsonAst<'a>, usize)]) -> usize { + let (mut object_len, mut jentry_index) = self.base_encoder.encode_object_header(obj.len()); + + // encode all keys first + for (key, _, _) in obj.iter() { + let len = key.len(); + object_len += len; + self.base_encoder.buf.extend_from_slice(key.as_bytes()); + let jentry = JEntry::make_string_jentry(len); + self.base_encoder.replace_jentry(jentry, &mut jentry_index); + } + // encode all values + for (_, value, _) in obj.iter() { + let jentry = self.encode_value(value); + object_len += jentry.length as usize; + self.base_encoder.replace_jentry(jentry, &mut jentry_index); + } + + object_len + } + + /// Encodes a single `JsonAst` value and returns its `JEntry`. + /// The `JEntry` contains metadata about the encoded value. 
+ fn encode_value(&mut self, value: &JsonAst<'a>) -> JEntry { + let jentry = match value { + JsonAst::Null => JEntry::make_null_jentry(), + JsonAst::Bool(v) => { + if *v { + JEntry::make_true_jentry() + } else { + JEntry::make_false_jentry() + } + } + JsonAst::Number(v) => { + let old_off = self.base_encoder.buf.len(); + let _ = v.compact_encode(&mut self.base_encoder.buf).unwrap(); + let len = self.base_encoder.buf.len() - old_off; + JEntry::make_number_jentry(len) + } + JsonAst::String(s) => { + let len = s.len(); + self.base_encoder + .buf + .extend_from_slice(s.as_ref().as_bytes()); + JEntry::make_string_jentry(len) + } + JsonAst::Array(array) => { + let len = self.encode_array(array); + JEntry::make_container_jentry(len) + } + JsonAst::Object(obj) => { + let len = self.encode_object(obj); + JEntry::make_container_jentry(len) + } + }; + + jentry + } +} diff --git a/src/error.rs b/src/error.rs index 371ad0b..ce5f326 100644 --- a/src/error.rs +++ b/src/error.rs @@ -35,13 +35,14 @@ pub enum ParseErrorCode { InvalidLoneLeadingSurrogateInHexEscape(u16), InvalidSurrogateInHexEscape(u16), UnexpectedEndOfHexEscape, + ObjectDuplicateKey(String), } pub type Result = std::result::Result; impl Display for ParseErrorCode { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - match *self { + match self { ParseErrorCode::InvalidEOF => f.write_str("EOF while parsing a value"), ParseErrorCode::InvalidNumberValue => f.write_str("invalid number"), ParseErrorCode::InvalidStringValue => f.write_str("invalid string"), @@ -68,6 +69,9 @@ impl Display for ParseErrorCode { write!(f, "invalid surrogate in hex escape '{:X}'", n) } ParseErrorCode::UnexpectedEndOfHexEscape => f.write_str("unexpected end of hex escape"), + ParseErrorCode::ObjectDuplicateKey(key) => { + write!(f, "duplicate object attribute \"{}\"", key) + } } } } diff --git a/src/lib.rs b/src/lib.rs index cdc670c..021a236 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -89,7 +89,12 @@ pub use number::Number; pub 
use owned::to_owned_jsonb; pub use owned::OwnedJsonb; pub use parser::from_slice; +pub use parser::parse_owned_jsonb; +pub use parser::parse_owned_jsonb_standard_mode; +pub use parser::parse_owned_jsonb_standard_mode_with_buf; +pub use parser::parse_owned_jsonb_with_buf; pub use parser::parse_value; +pub use parser::parse_value_standard_mode; pub use raw::from_raw_jsonb; pub use raw::RawJsonb; pub use value::*; diff --git a/src/number.rs b/src/number.rs index c684cbf..ded0170 100644 --- a/src/number.rs +++ b/src/number.rs @@ -28,10 +28,97 @@ use serde::de::Deserialize; use serde::de::Deserializer; use serde::de::Visitor; use serde::ser::Serialize; -use serde::ser::SerializeStruct; use serde::ser::Serializer; -const NUMBER_TOKEN: &str = "$serde_json::private::Number"; +// Pre-calculate powers of 10 for common scales to avoid repeated computation +const I128_POWERS_OF_10: [i128; 39] = [ + 1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000, + 10000000000000000000, + 100000000000000000000, + 1000000000000000000000, + 10000000000000000000000, + 100000000000000000000000, + 1000000000000000000000000, + 10000000000000000000000000, + 100000000000000000000000000, + 1000000000000000000000000000, + 10000000000000000000000000000, + 100000000000000000000000000000, + 1000000000000000000000000000000, + 10000000000000000000000000000000, + 100000000000000000000000000000000, + 1000000000000000000000000000000000, + 10000000000000000000000000000000000, + 100000000000000000000000000000000000, + 1000000000000000000000000000000000000, + 10000000000000000000000000000000000000, + 100000000000000000000000000000000000000, +]; + +// Pre-calculate leading zeros to avoid repeated computation +const LEADING_ZEROS: [&str; 38] = [ + "", + "0", + "00", + "000", + "0000", + "00000", + 
"000000", + "0000000", + "00000000", + "000000000", + "0000000000", + "00000000000", + "000000000000", + "0000000000000", + "00000000000000", + "000000000000000", + "0000000000000000", + "00000000000000000", + "000000000000000000", + "0000000000000000000", + "00000000000000000000", + "000000000000000000000", + "0000000000000000000000", + "00000000000000000000000", + "000000000000000000000000", + "0000000000000000000000000", + "00000000000000000000000000", + "000000000000000000000000000", + "0000000000000000000000000000", + "00000000000000000000000000000", + "000000000000000000000000000000", + "0000000000000000000000000000000", + "00000000000000000000000000000000", + "000000000000000000000000000000000", + "0000000000000000000000000000000000", + "00000000000000000000000000000000000", + "000000000000000000000000000000000000", + "0000000000000000000000000000000000000", +]; + +const I128_SCALE: usize = 38; + +static I256_DIVIDE_SCALE: std::sync::LazyLock = + std::sync::LazyLock::new(|| i256::from(100000000000000000000000000000000000000_i128)); /// Represents a decimal number with 64-bit precision. /// @@ -184,6 +271,9 @@ impl Serialize for Number { /// /// This implementation supports serialization to JSON integers and floats. /// It automatically selects the most suitable output format based on the internal representation. + /// + /// When the `arbitrary_precision` feature is enabled, decimal types are serialized with full precision + /// using the optimized formatting functions. When disabled, decimal types are converted to f64. 
fn serialize(&self, serializer: S) -> std::result::Result where S: Serializer, @@ -192,12 +282,71 @@ impl Serialize for Number { Number::Int64(v) => serializer.serialize_i64(*v), Number::UInt64(v) => serializer.serialize_u64(*v), Number::Float64(v) => serializer.serialize_f64(*v), + #[cfg(feature = "arbitrary_precision")] Number::Decimal64(_) | Number::Decimal128(_) | Number::Decimal256(_) => { - let mut serialize_struct = serializer.serialize_struct(NUMBER_TOKEN, 0)?; - let val = format!("{}", self); - serialize_struct.serialize_field(NUMBER_TOKEN, val.as_str())?; + use serde::ser::SerializeStruct; + use std::io::Write; + const NUMBER_TOKEN: &str = "$serde_json::private::Number"; + + struct WriteAdapter<'a>(&'a mut std::io::Cursor<&'a mut [u8]>); + + impl std::fmt::Write for WriteAdapter<'_> { + fn write_str(&mut self, s: &str) -> std::fmt::Result { + self.0.write_all(s.as_bytes()).map_err(|_| std::fmt::Error) + } + } + + impl WriteAdapter<'_> { + fn position(&self) -> usize { + self.0.position() as usize + } + } + + let mut buffer = [0u8; 128]; + let mut cursor = std::io::Cursor::new(&mut buffer[..]); + let mut adapter = WriteAdapter(&mut cursor); + + match self { + Number::Decimal64(v) => { + format_decimal_i128(&mut adapter, v.value as i128, v.scale as usize) + .map_err(|e| { + serde::ser::Error::custom(format!("Format decimal64 error: {e}")) + })?; + } + Number::Decimal128(v) => { + format_decimal_i128(&mut adapter, v.value, v.scale as usize).map_err( + |e| serde::ser::Error::custom(format!("Format decimal128 error: {e}")), + )?; + } + Number::Decimal256(v) => { + format_decimal_i256(&mut adapter, v.value, v.scale as usize).map_err( + |e| serde::ser::Error::custom(format!("Format decimal256 error: {e}")), + )?; + } + _ => unreachable!(), + } + + let pos = adapter.position(); + let num_str = std::str::from_utf8(&buffer[..pos]).map_err(|e| { + serde::ser::Error::custom(format!("Invalid decimal number: {e}")) + })?; + + let mut serialize_struct = 
serializer.serialize_struct(NUMBER_TOKEN, 1)?; + serialize_struct.serialize_field(NUMBER_TOKEN, num_str)?; serialize_struct.end() } + #[cfg(not(feature = "arbitrary_precision"))] + Number::Decimal64(_) | Number::Decimal128(_) | Number::Decimal256(_) => { + // Convert to f64 when arbitrary_precision is not enabled + let (value, scale) = match self { + Number::Decimal64(v) => (v.value as f64, v.scale as i32), + Number::Decimal128(v) => (v.value as f64, v.scale as i32), + Number::Decimal256(v) => (v.value.as_f64(), v.scale as i32), + _ => unreachable!(), + }; + let scaled_value = value / 10f64.powi(scale); + serializer.serialize_f64(scaled_value) + } } } } @@ -915,80 +1064,165 @@ impl Display for Number { let s = buffer.format(*v); write!(f, "{}", s) } - Number::Decimal64(v) => { - if v.scale == 0 { - write!(f, "{}", v.value) - } else { - let pow_scale = 10_i64.pow(v.scale as u32); - if v.value >= 0 { - write!( - f, - "{}.{:0>width$}", - v.value / pow_scale, - (v.value % pow_scale).abs(), - width = v.scale as usize - ) - } else { - write!( - f, - "-{}.{:0>width$}", - -v.value / pow_scale, - (v.value % pow_scale).abs(), - width = v.scale as usize - ) - } - } + Number::Decimal64(v) => format_decimal_i128(f, v.value as i128, v.scale as usize), + Number::Decimal128(v) => format_decimal_i128(f, v.value, v.scale as usize), + Number::Decimal256(v) => format_decimal_i256(f, v.value, v.scale as usize), + } + } +} + +/// Helper function to format a decimal i128 value to a formatter without string allocations +/// +/// This function efficiently formats a decimal number with the following optimizations: +/// 1. Uses stack-allocated buffers instead of heap allocations +/// 2. Handles the sign separately to simplify the formatting logic +/// 3. Uses the fast itoa library for integer-to-string conversion +/// 4. 
Pre-computed zero strings for padding fractional parts +fn format_decimal_i128( + f: &mut impl std::fmt::Write, + value: i128, + scale: usize, +) -> std::fmt::Result { + let mut itoa_buf = itoa::Buffer::new(); + if scale == 0 { + f.write_str(itoa_buf.format(value)) + } else { + // Handle negative numbers by writing the minus sign and working with absolute value + let value = if value < 0 { + f.write_str("-")?; + -value + } else { + value + }; + let pow_scale = I128_POWERS_OF_10[scale]; + // Split the value into integer and fractional parts + let integer_part = value / pow_scale; + f.write_str(itoa_buf.format(integer_part))?; + f.write_str(".")?; + + // Format the fractional part with leading zeros if needed + let fractional_part = (value % pow_scale).abs(); + let fractional_str = itoa_buf.format(fractional_part); + + let leading_zeros_count = scale - fractional_str.len(); + if leading_zeros_count > 0 { + let zeros = LEADING_ZEROS[leading_zeros_count]; + f.write_str(zeros)?; + } + f.write_str(fractional_str) + } +} + +/// Formats a decimal i256 value to a formatter without heap allocations. +/// +/// This function efficiently formats a 256-bit decimal number with the specified scale +/// (number of decimal places) by splitting it into high and low 128-bit parts. 
+fn format_decimal_i256( + f: &mut impl std::fmt::Write, + value: i256, + scale: usize, +) -> std::fmt::Result { + // Handle negative values by writing the minus sign and negating the value + let value = if value < i256::ZERO { + f.write_str("-")?; + -value + } else { + value + }; + + // Split the i256 value into high and low parts for easier formatting + let high_part = (value / *I256_DIVIDE_SCALE).as_i128(); + let low_part = (value % *I256_DIVIDE_SCALE).as_i128(); + let mut itoa_buf = itoa::Buffer::new(); + + // Case 1: Integer-only formatting (no decimal places) + if scale == 0 { + if high_part > 0 { + // Format high part first (most significant digits) + f.write_str(itoa_buf.format(high_part))?; + + // Format low part with proper zero padding to maintain place value + let low_str = itoa_buf.format(low_part); + let zeros_count = I128_SCALE - low_str.len(); + if zeros_count > 0 { + let zeros = LEADING_ZEROS[zeros_count]; + f.write_str(zeros)?; } - Number::Decimal128(v) => { - if v.scale == 0 { - write!(f, "{}", v.value) - } else { - let pow_scale = 10_i128.pow(v.scale as u32); - if v.value >= 0 { - write!( - f, - "{}.{:0>width$}", - v.value / pow_scale, - (v.value % pow_scale).abs(), - width = v.scale as usize - ) - } else { - write!( - f, - "-{}.{:0>width$}", - -v.value / pow_scale, - (v.value % pow_scale).abs(), - width = v.scale as usize - ) - } - } + f.write_str(low_str) + } else { + // Only low part has non-zero value + f.write_str(itoa_buf.format(low_part)) + } + } + // Case 2: Decimal point falls within the high part (large scale) + else if scale >= I128_SCALE { + // Calculate how many decimal places are in the high part + let high_scale = scale - I128_SCALE; + let pow_scale = I128_POWERS_OF_10[high_scale]; + + // Format the integer portion from the high part + let int_part = high_part / pow_scale; + f.write_str(itoa_buf.format(int_part))?; + f.write_str(".")?; + + // Format the fractional portion from the high part + if high_scale > 0 { + let 
high_frac_part = high_part % pow_scale; + let high_frac_str = itoa_buf.format(high_frac_part); + + // Add leading zeros if needed + let high_zeros_count = high_scale - high_frac_str.len(); + if high_zeros_count > 0 { + let zeros = LEADING_ZEROS[high_zeros_count]; + f.write_str(zeros)?; } - Number::Decimal256(v) => { - if v.scale == 0 { - write!(f, "{}", v.value) - } else { - let pow_scale = i256::from(10).pow(v.scale as u32); - // -1/10 = 0 - if v.value >= i256::from(0) { - write!( - f, - "{}.{:0>width$}", - v.value / pow_scale, - (v.value % pow_scale).abs(), - width = v.scale as usize - ) - } else { - write!( - f, - "-{}.{:0>width$}", - -v.value / pow_scale, - (v.value % pow_scale).abs(), - width = v.scale as usize - ) - } - } + f.write_str(high_frac_str)?; + } + + // Format the low part with proper zero padding + let mut low_buf = itoa::Buffer::new(); + let low_frac_str = low_buf.format(low_part); + let low_zeros_count = I128_SCALE - low_frac_str.len(); + if low_zeros_count > 0 { + let low_zeros = LEADING_ZEROS[low_zeros_count]; + f.write_str(low_zeros)?; + } + f.write_str(low_frac_str) + } + // Case 3: Decimal point falls within the low part + else { + // Format high part if it exists (integer portion) + if high_part > 0 { + f.write_str(itoa_buf.format(high_part))?; + } + let pow_scale = I128_POWERS_OF_10[scale]; + + // Calculate integer part from low component + let int_part = low_part / pow_scale; + let int_str = itoa_buf.format(int_part); + + // If high part exists, we need to ensure proper place value with padding + if high_part > 0 { + let int_zeros_count = I128_SCALE - scale - int_str.len(); + if int_zeros_count > 0 { + let int_zeros = LEADING_ZEROS[int_zeros_count]; + f.write_str(int_zeros)?; } } + f.write_str(int_str)?; + f.write_str(".")?; + + // Format fractional part from low component with proper zero padding + let frac_part = low_part % pow_scale; + let mut frac_buf = itoa::Buffer::new(); + let frac_str = frac_buf.format(frac_part); + + let 
frac_zeros_count = scale - frac_str.len(); + if frac_zeros_count > 0 { + let frac_zeros = LEADING_ZEROS[frac_zeros_count]; + f.write_str(frac_zeros)?; + } + f.write_str(frac_str) } } diff --git a/src/parser.rs b/src/parser.rs index e1d033a..b6bc9f9 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -24,13 +24,18 @@ use super::value::Object; use super::value::Value; use crate::core::Decoder; -use std::str::FromStr; - +use crate::core::JsonAstEncoder; +#[cfg(feature = "arbitrary_precision")] use crate::Decimal128; +#[cfg(feature = "arbitrary_precision")] use crate::Decimal256; +#[cfg(feature = "arbitrary_precision")] use crate::Decimal64; +use crate::OwnedJsonb; +#[cfg(feature = "arbitrary_precision")] use ethnum::i256; +#[cfg(feature = "arbitrary_precision")] const MAX_DECIMAL64_PRECISION: usize = 18; const MAX_DECIMAL128_PRECISION: usize = 38; const MAX_DECIMAL256_PRECISION: usize = 76; @@ -39,11 +44,161 @@ const UINT64_MIN: i128 = 0i128; const UINT64_MAX: i128 = 18_446_744_073_709_551_615i128; const INT64_MIN: i128 = -9_223_372_036_854_775_808i128; const INT64_MAX: i128 = 9_223_372_036_854_775_807i128; +#[cfg(feature = "arbitrary_precision")] const DECIMAL64_MIN: i128 = -999999999999999999i128; +#[cfg(feature = "arbitrary_precision")] const DECIMAL64_MAX: i128 = 999999999999999999i128; +#[cfg(feature = "arbitrary_precision")] const DECIMAL128_MIN: i128 = -99999999999999999999999999999999999999i128; +#[cfg(feature = "arbitrary_precision")] const DECIMAL128_MAX: i128 = 99999999999999999999999999999999999999i128; +#[cfg(feature = "arbitrary_precision")] +static POWER_TABLE: std::sync::LazyLock<[i256; 39]> = std::sync::LazyLock::new(|| { + [ + i256::from(1_i128), + i256::from(10_i128), + i256::from(100_i128), + i256::from(1000_i128), + i256::from(10000_i128), + i256::from(100000_i128), + i256::from(1000000_i128), + i256::from(10000000_i128), + i256::from(100000000_i128), + i256::from(1000000000_i128), + i256::from(10000000000_i128), + i256::from(100000000000_i128), + 
i256::from(1000000000000_i128), + i256::from(10000000000000_i128), + i256::from(100000000000000_i128), + i256::from(1000000000000000_i128), + i256::from(10000000000000000_i128), + i256::from(100000000000000000_i128), + i256::from(1000000000000000000_i128), + i256::from(10000000000000000000_i128), + i256::from(100000000000000000000_i128), + i256::from(1000000000000000000000_i128), + i256::from(10000000000000000000000_i128), + i256::from(100000000000000000000000_i128), + i256::from(1000000000000000000000000_i128), + i256::from(10000000000000000000000000_i128), + i256::from(100000000000000000000000000_i128), + i256::from(1000000000000000000000000000_i128), + i256::from(10000000000000000000000000000_i128), + i256::from(100000000000000000000000000000_i128), + i256::from(1000000000000000000000000000000_i128), + i256::from(10000000000000000000000000000000_i128), + i256::from(100000000000000000000000000000000_i128), + i256::from(1000000000000000000000000000000000_i128), + i256::from(10000000000000000000000000000000000_i128), + i256::from(100000000000000000000000000000000000_i128), + i256::from(1000000000000000000000000000000000000_i128), + i256::from(10000000000000000000000000000000000000_i128), + i256::from(100000000000000000000000000000000000000_i128), + ] +}); + +/// Intermediate Abstract Syntax Tree representation of JSON values optimized for parsing performance. +/// +/// `JsonAst` serves as an efficient intermediate representation during the JSON parsing process, +/// providing several performance optimizations: +/// +/// 1. **Zero-copy string handling**: Uses `Cow<'a, str>` to avoid unnecessary string allocations +/// when the input can be directly borrowed, for both string values and object keys. +/// +/// 2. **Efficient object representation**: Stores object entries as a vector of tuples with +/// keys using the same zero-copy `Cow<'a, str>` approach, reducing memory overhead and +/// avoiding unnecessary allocations during parsing. +/// +/// 3. 
**Immediate validation**: Performs object key uniqueness validation and sorting during +/// the parsing process, eliminating the need for a separate validation pass and reducing +/// overall processing time. +/// +/// 4. **Lifetime preservation**: Maintains the lifetime of the original input buffer throughout +/// the parsing process, minimizing unnecessary copying of data. +/// +/// 5. **Direct conversion path**: Provides an optimized conversion to the final `OwnedJsonb` type +/// through the `into_owned_jsonb` method. +/// +/// This approach separates the parsing concerns from the final representation concerns, +/// allowing each to be optimized independently. +#[derive(Clone, PartialEq, Default, Eq)] +pub(crate) enum JsonAst<'a> { + #[default] + Null, + Bool(bool), + String(Cow<'a, str>), + Number(Number), + Array(Vec>), + Object(Vec<(Cow<'a, str>, JsonAst<'a>, usize)>), +} + +impl<'a> JsonAst<'a> { + fn as_string(&self) -> Option> { + match self { + JsonAst::String(s) => Some(s.clone()), + _ => None, + } + } + + /// Converts the intermediate `JsonAst` representation to the final `Value` type. + fn into_value(self) -> Result> { + let value = match self { + JsonAst::Null => Value::Null, + JsonAst::Bool(v) => Value::Bool(v), + JsonAst::String(v) => Value::String(v), + JsonAst::Number(v) => Value::Number(v), + JsonAst::Array(vals) => { + let mut values = Vec::with_capacity(vals.len()); + for val in vals.into_iter() { + let value = val.into_value()?; + values.push(value); + } + Value::Array(values) + } + JsonAst::Object(kvs) => { + let mut object = Object::new(); + for (key, val, pos) in kvs.into_iter() { + let key_str = key.to_string(); + if object.contains_key(&key_str) { + let code = ParseErrorCode::ObjectDuplicateKey(key_str); + return Err(Error::Syntax(code, pos)); + } + let value = val.into_value()?; + object.insert(key_str, value); + } + Value::Object(object) + } + }; + Ok(value) + } + + /// Converts the `JsonAst` to an owned JSONB representation. 
+ /// + /// This method optimizes the conversion process by: + /// + /// 1. Pre-calculating the required buffer size to avoid reallocations + /// 2. Using a specialized encoder (JsonAstEncoder) that understands the JsonAst structure + /// 3. Directly encoding from the parsing-optimized representation without + /// first converting to the intermediate `Value` type + /// 4. Preserving the performance benefits of the sorted keys and compact representation + /// + /// Returns an `OwnedJsonb` containing the binary JSONB representation. + fn into_owned_jsonb(self, size: usize) -> OwnedJsonb { + let mut buf = Vec::with_capacity(size); + let mut encoder = JsonAstEncoder::new(&mut buf); + encoder.encode(&self); + OwnedJsonb::new(buf) + } + + /// Converts the `JsonAst` to an owned JSONB representation with result buffer. + fn into_owned_jsonb_with_buffer(self, size: usize, result_buf: &mut Vec) { + result_buf.reserve(size); + let mut encoder = JsonAstEncoder::new(result_buf); + encoder.encode(&self); + } +} + /// The binary `JSONB` contains three parts, `Header`, `JEntry` and `RawData`. /// This structure can be nested. Each group of structures starts with a `Header`. /// The upper-level `Value` will store the `Header` length or offset of @@ -80,25 +235,104 @@ pub fn from_slice(buf: &[u8]) -> Result> { } } -// Parse JSON text to JSONB Value. -// Inspired by `https://github.com/jorgecarleitao/json-deserializer` -// Thanks Jorge Leitao. +/// Parse JSON text to JSONB Value with extended mode. +/// The parser will follow extended JSON syntax rules like leading plus signs, +/// multiple leading zeros, decimal points without digits, and empty array elements. +/// Numeric values are preferentially parsed as decimal values to ensure that precision is not lost. +/// +/// Inspired by `https://github.com/jorgecarleitao/json-deserializer` +/// Thanks Jorge Leitao.
pub fn parse_value(buf: &[u8]) -> Result> { let mut parser = Parser::new(buf); - parser.parse() + let json_ast = parser.parse()?; + json_ast.into_value() +} + +/// Parse JSON text to JSONB Value with standard mode. +/// The parser will follow standard JSON syntax rules. +pub fn parse_value_standard_mode(buf: &[u8]) -> Result> { + let mut parser = Parser::new_standard_mode(buf); + let json_ast = parser.parse()?; + json_ast.into_value() +} + +/// Parses JSON text into an owned JSONB binary representation. +/// The parser will follow extended JSON syntax rules. +pub fn parse_owned_jsonb(buf: &[u8]) -> Result { + let size = buf.len(); + let mut parser = Parser::new(buf); + let json_ast = parser.parse()?; + Ok(json_ast.into_owned_jsonb(size)) +} + +/// Parses JSON text into an owned JSONB binary representation using standard JSON syntax rules. +/// The parser will follow standard JSON syntax rules. +pub fn parse_owned_jsonb_standard_mode(buf: &[u8]) -> Result { + let size = buf.len(); + let mut parser = Parser::new_standard_mode(buf); + let json_ast = parser.parse()?; + Ok(json_ast.into_owned_jsonb(size)) +} + +/// Parses JSON text into a provided buffer as JSONB binary representation. +/// The parser will follow extended JSON syntax rules. +pub fn parse_owned_jsonb_with_buf(buf: &[u8], result_buf: &mut Vec) -> Result<()> { + let size = buf.len(); + let mut parser = Parser::new(buf); + let json_ast = parser.parse()?; + json_ast.into_owned_jsonb_with_buffer(size, result_buf); + Ok(()) } +/// Parses JSON text into a provided buffer as JSONB binary representation using standard JSON syntax rules. +/// The parser will follow standard JSON syntax rules. 
+pub fn parse_owned_jsonb_standard_mode_with_buf( + buf: &[u8], + result_buf: &mut Vec, +) -> Result<()> { + let size = buf.len(); + let mut parser = Parser::new_standard_mode(buf); + let json_ast = parser.parse()?; + json_ast.into_owned_jsonb_with_buffer(size, result_buf); + Ok(()) +} + +/// JSON parser with optimized parsing strategies. +/// +/// This parser implements both standard JSON parsing and an extended syntax with additional features. +/// It uses a single-pass approach for better performance and provides detailed error reporting. struct Parser<'a> { + /// Input buffer containing the JSON text to parse buf: &'a [u8], + /// Current position in the buffer idx: usize, + /// Function pointer for parsing numbers based on the mode + parse_number_fn: fn(&mut Self) -> Result>, + /// Function pointer for parsing arrays based on the mode + parse_array_fn: fn(&mut Self) -> Result>, } impl<'a> Parser<'a> { - fn new(buf: &'a [u8]) -> Parser<'a> { - Self { buf, idx: 0 } + fn new(buf: &'a [u8]) -> Self { + Self { + buf, + idx: 0, + parse_number_fn: Self::parse_json_number, + parse_array_fn: Self::parse_json_array, + } + } + + fn new_standard_mode(buf: &'a [u8]) -> Self { + Self { + buf, + idx: 0, + parse_number_fn: Self::parse_standard_json_number, + parse_array_fn: Self::parse_standard_json_array, + } } - fn parse(&mut self) -> Result> { + /// Parse a complete JSON document from the input buffer. + fn parse(&mut self) -> Result> { let val = self.parse_json_value()?; self.skip_unused(); if self.idx < self.buf.len() { @@ -108,16 +342,21 @@ impl<'a> Parser<'a> { Ok(val) } - fn parse_json_value(&mut self) -> Result> { + /// Parse a JSON value, dispatching to the appropriate parser based on the first character. + /// + /// This is an optimized version that avoids runtime mode checks by using function pointers + /// selected during parser initialization. 
+ #[inline] + fn parse_json_value(&mut self) -> Result> { self.skip_unused(); let c = self.next()?; match c { b'n' => self.parse_json_null(), b't' => self.parse_json_true(), b'f' => self.parse_json_false(), - b'0'..=b'9' | b'-' | b'+' | b'.' => self.parse_json_number(), + b'0'..=b'9' | b'-' | b'+' | b'.' => (self.parse_number_fn)(self), b'"' => self.parse_json_string(), - b'[' => self.parse_json_array(), + b'[' => (self.parse_array_fn)(self), b'{' => self.parse_json_object(), _ => { self.step(); @@ -248,43 +487,127 @@ impl<'a> Parser<'a> { } } - fn parse_json_null(&mut self) -> Result> { + fn parse_json_null(&mut self) -> Result> { let data = [b'n', b'u', b'l', b'l']; for v in data.into_iter() { self.must_is(v)?; } - Ok(Value::Null) + Ok(JsonAst::Null) } - fn parse_json_true(&mut self) -> Result> { + fn parse_json_true(&mut self) -> Result> { let data = [b't', b'r', b'u', b'e']; for v in data.into_iter() { self.must_is(v)?; } - Ok(Value::Bool(true)) + Ok(JsonAst::Bool(true)) } - fn parse_json_false(&mut self) -> Result> { + fn parse_json_false(&mut self) -> Result> { let data = [b'f', b'a', b'l', b's', b'e']; for v in data.into_iter() { self.must_is(v)?; } - Ok(Value::Bool(false)) + Ok(JsonAst::Bool(false)) + } + + /// Parse JSON numbers in standard mode + /// + /// This function implements strict parsing according to the standard JSON specification: + /// 1. No leading plus sign (e.g., `+123`) + /// 2. No multiple leading zeros (e.g., `000123`) + /// 3. Decimal point must have digits on both sides (no `.123` or `123.`) + /// 4. Exponent part must have digits + /// + /// Parsing strategy: + /// 1. First try to parse as integer (i64/u64) + /// 2. 
If it contains decimal point or exponent, parse as floating point (f64) + fn parse_standard_json_number(&mut self) -> Result> { + let start_idx = self.idx; + + let mut negative = false; + let mut has_fraction = false; + let mut has_exponent = false; + + let c = self.next()?; + if *c == b'-' { + negative = true; + self.step(); + } else if *c == b'+' || *c == b'.' { + self.step(); + return Err(self.error(ParseErrorCode::InvalidNumberValue)); + } + if self.check_next(b'0') { + self.step(); + if self.check_digit() { + self.step(); + return Err(self.error(ParseErrorCode::InvalidNumberValue)); + } + } else { + let len = self.step_digits(); + if len == 0 { + if !negative { + self.step(); + } + return Err(self.error(ParseErrorCode::InvalidNumberValue)); + } + } + if self.check_next(b'.') { + has_fraction = true; + self.step(); + let len = self.step_digits(); + if len == 0 { + self.step(); + return Err(self.error(ParseErrorCode::InvalidNumberValue)); + } + } + if self.check_next_either(b'E', b'e') { + has_exponent = true; + self.step(); + if self.check_next_either(b'+', b'-') { + self.step(); + } + let len = self.step_digits(); + if len == 0 { + return Err(self.error(ParseErrorCode::InvalidNumberValue)); + } + } + let s = unsafe { std::str::from_utf8_unchecked(&self.buf[start_idx..self.idx]) }; + + if !has_fraction && !has_exponent { + if !negative { + if let Ok(v) = s.parse::() { + return Ok(JsonAst::Number(Number::UInt64(v))); + } + } else if let Ok(v) = s.parse::() { + return Ok(JsonAst::Number(Number::Int64(v))); + } + } + + match fast_float2::parse(s) { + Ok(v) => Ok(JsonAst::Number(Number::Float64(v))), + Err(_) => Err(self.error(ParseErrorCode::InvalidNumberValue)), + } } - /// Parse a JSON number using a single-pass approach with multiple fallback strategies. + /// Parse extended JSON numbers (supporting non-standard syntax) + /// + /// This function implements a high-performance JSON number parsing algorithm with extended syntax: + /// 1. 
Support for leading plus sign (e.g., `+123`) + /// 2. Support for multiple leading zeros (e.g., `000123`) + /// 3. Support for a decimal point with digits missing on one side (e.g., `.123` or `123.`) /// - /// This function implements a high-performance JSON number parsing algorithm that: - /// 1. First attempts to parse the number as an i128 (for Decimal128/Int64/UInt64) - /// 2. Falls back to i256 (for Decimal256) if precision exceeds i128 capacity - /// 3. Finally falls back to Float64 if all other methods fail + /// Zero-allocation parsing strategy: + /// 1. Uses direct digit accumulation without intermediate string conversions + /// 2. For standard numeric types (Int64/UInt64), directly builds the value during parsing + /// 3. For decimal types, tracks scale and precision during the single-pass parse + /// 4. Falls back to Float64 parsing only when necessary /// - /// Extended JSON number syntax support: - /// - Leading plus sign (e.g., +123) which standard JSON doesn't allow - /// - Multiple leading zeros (e.g., 000123) which standard JSON doesn't allow - /// - Decimal point without preceding digits (e.g., .123) which standard JSON requires at least one digit before decimal - /// - Decimal point without any digits (e.g., 123.) 
which standard JSON requires at least one digit after decimal - fn parse_json_number(&mut self) -> Result> { + /// This implementation prioritizes performance through: + /// - Single-pass approach with minimal branching + /// - Avoiding heap allocations and string conversions + /// - Optimized handling of common number formats + fn parse_json_number(&mut self) -> Result> { // Store the starting position for potential fallback parsing let start_idx = self.idx; let mut negative = false; @@ -310,32 +633,35 @@ impl<'a> Parser<'a> { } } - // Mark the position where actual digits start (after sign and leading zeros) - let num_start_idx = self.idx; - // Initialize parsing state - let mut value = 0_i128; // Accumulates the numeric value + let mut hi_value = 0_i128; // Accumulates the first MAX_DECIMAL128_PRECISION digits + let mut lo_value = 0_i128; // Accumulates any digits beyond the first MAX_DECIMAL128_PRECISION let mut scale = 0_u32; // Tracks decimal places - let mut fraction_offset = None; // Position of decimal point, if any - let mut has_exponent = false; // Whether the number has an exponent part let mut precision = 0; // Count of significant digits + let mut has_fraction = false; // Whether the number has a fraction part + let mut has_exponent = false; // Whether the number has an exponent part - // First parsing strategy: Try to parse as i128 with precision limit - while precision < MAX_DECIMAL128_PRECISION { + // Parse digits, supporting up to MAX_DECIMAL256_PRECISION digits + while precision < MAX_DECIMAL256_PRECISION { if self.check_digit() { // Parse digit and accumulate value let digit = (self.buf[self.idx] - b'0') as i128; - // Use unchecked operations for performance (we control precision limits) - value = unsafe { value.unchecked_mul(10_i128) }; - value = unsafe { value.unchecked_add(digit) }; + // Store in hi_value or lo_value based on precision + if precision < MAX_DECIMAL128_PRECISION { + hi_value = unsafe { hi_value.unchecked_mul(10_i128) }; + hi_value = unsafe { 
hi_value.unchecked_add(digit) }; + } else { + lo_value = unsafe { lo_value.unchecked_mul(10_i128) }; + lo_value = unsafe { lo_value.unchecked_add(digit) }; + } self.step(); } else if self.check_next(b'.') { // Handle decimal point - can only appear once - if fraction_offset.is_some() { + if has_fraction { return Err(self.error(ParseErrorCode::InvalidNumberValue)); } - fraction_offset = Some(self.idx); + has_fraction = true; self.step(); // Continue to next iteration without incrementing precision continue; @@ -345,24 +671,24 @@ impl<'a> Parser<'a> { } precision += 1; // Track scale (number of digits after decimal point) - if fraction_offset.is_some() { + if has_fraction { scale += 1; } } - // Handle numbers that exceed MAX_DECIMAL128_PRECISION - if precision == MAX_DECIMAL128_PRECISION { + // Handle numbers that exceed MAX_DECIMAL256_PRECISION + if precision == MAX_DECIMAL256_PRECISION { // If we haven't seen a decimal point yet, continue parsing integer part - if fraction_offset.is_none() { + if !has_fraction { let len = self.step_digits(); precision += len; if self.check_next(b'.') { - fraction_offset = Some(self.idx); + has_fraction = true; self.step(); } } // Parse fractional part if decimal point exists - if fraction_offset.is_some() { + if has_fraction { let len = self.step_digits(); precision += len; scale += len as u32; @@ -371,7 +697,7 @@ impl<'a> Parser<'a> { // Handle empty precision if !leading_zeros && precision == 0 { - return Err(self.error(ParseErrorCode::ExpectedSomeValue)); + return Err(self.error(ParseErrorCode::InvalidNumberValue)); } // Handle exponent notation (e.g., 1e10, 1.5E-7) if self.check_next_either(b'E', b'e') { @@ -390,68 +716,57 @@ impl<'a> Parser<'a> { // If no exponent and precision is within limits, try to return the most appropriate numeric type if !has_exponent && precision <= MAX_DECIMAL128_PRECISION { - // Apply sign - if negative { - value = value.checked_neg().unwrap(); - } + let value = if negative { -hi_value } else { 
hi_value }; // Try to fit the value into the most appropriate numeric type if scale == 0 && (UINT64_MIN..=UINT64_MAX).contains(&value) { - return Ok(Value::Number(Number::UInt64(u64::try_from(value).unwrap()))); + return Ok(JsonAst::Number(Number::UInt64( + u64::try_from(value).unwrap(), + ))); } else if scale == 0 && (INT64_MIN..=INT64_MAX).contains(&value) { - return Ok(Value::Number(Number::Int64(i64::try_from(value).unwrap()))); - } else if (DECIMAL64_MIN..=DECIMAL64_MAX).contains(&value) - && precision <= MAX_DECIMAL64_PRECISION + return Ok(JsonAst::Number(Number::Int64( + i64::try_from(value).unwrap(), + ))); + } + #[cfg(feature = "arbitrary_precision")] { - return Ok(Value::Number(Number::Decimal64(Decimal64 { - scale: scale as u8, - value: i64::try_from(value).unwrap(), - }))); - } else if (DECIMAL128_MIN..=DECIMAL128_MAX).contains(&value) { - return Ok(Value::Number(Number::Decimal128(Decimal128 { - scale: scale as u8, - value, - }))); + if (DECIMAL64_MIN..=DECIMAL64_MAX).contains(&value) + && precision <= MAX_DECIMAL64_PRECISION + { + return Ok(JsonAst::Number(Number::Decimal64(Decimal64 { + scale: scale as u8, + value: i64::try_from(value).unwrap(), + }))); + } else if (DECIMAL128_MIN..=DECIMAL128_MAX).contains(&value) { + return Ok(JsonAst::Number(Number::Decimal128(Decimal128 { + scale: scale as u8, + value, + }))); + } } } // Second parsing strategy: Try to parse as i256 for very large numbers + #[cfg(feature = "arbitrary_precision")] if !has_exponent && precision <= MAX_DECIMAL256_PRECISION { - let end_idx = self.idx; - - // Reconstruct the string representation without the decimal point - let digit_str = if let Some(frac_idx) = fraction_offset { - let digit_len = end_idx - num_start_idx - 1; - let mut s = String::with_capacity(digit_len); - s.push_str(unsafe { - std::str::from_utf8_unchecked(&self.buf[num_start_idx..frac_idx]) - }); - s.push_str(unsafe { - std::str::from_utf8_unchecked(&self.buf[frac_idx + 1..end_idx]) - }); - s - } else { - unsafe 
{ std::str::from_utf8_unchecked(&self.buf[num_start_idx..end_idx]) } - .to_string() - }; - - // Try to parse as i256 - if let Ok(mut value) = i256::from_str(&digit_str) { - if negative { - value = value.checked_neg().unwrap(); - } - return Ok(Value::Number(Number::Decimal256(Decimal256 { - scale: scale as u8, - value, - }))); + // Combine high value and low value to i256 value + let multiplier = POWER_TABLE[precision - MAX_DECIMAL128_PRECISION]; + let mut i256_value = i256::from(hi_value) * multiplier + i256::from(lo_value); + if negative { + i256_value *= -1; } + + return Ok(JsonAst::Number(Number::Decimal256(Decimal256 { + scale: scale as u8, + value: i256_value, + }))); } // Final fallback strategy: Parse as Float64 using fast_float2 library - // This handles cases like scientific notation and very large/small numbers + // This handles scientific notation and very large/small numbers let s = unsafe { std::str::from_utf8_unchecked(&self.buf[start_idx..self.idx]) }; match fast_float2::parse(s) { - Ok(v) => Ok(Value::Number(Number::Float64(v))), + Ok(v) => Ok(JsonAst::Number(Number::Float64(v))), Err(_) => Err(self.error(ParseErrorCode::InvalidNumberValue)), } } @@ -466,7 +781,7 @@ impl<'a> Parser<'a> { /// The implementation uses a two-pass approach for strings with escapes: /// - First pass: Count escapes and determine string boundaries /// - Second pass: Process escape sequences only when necessary - fn parse_json_string(&mut self) -> Result> { + fn parse_json_string(&mut self) -> Result> { // Ensure the string starts with a quote self.must_is(b'"')?; @@ -526,7 +841,44 @@ impl<'a> Parser<'a> { .map(Cow::Borrowed) .map_err(|_| self.error(ParseErrorCode::InvalidStringValue))? }; - Ok(Value::String(val)) + Ok(JsonAst::String(val)) + } + + /// Parse a JSON array with standard mode. 
+ fn parse_standard_json_array(&mut self) -> Result> { + // Ensure the array starts with an opening bracket + self.must_is(b'[')?; + + let mut first = true; + let mut values = Vec::with_capacity(8); + + // Parse array elements until closing bracket is found + loop { + self.skip_unused(); + let c = self.next()?; + + // Check for end of array + if *c == b']' { + self.step(); + break; + } + + // Handle comma separator between elements (not for the first element) + if !first { + if *c != b',' { + return Err(self.error(ParseErrorCode::ExpectedArrayCommaOrEnd)); + } + self.step(); + } + first = false; + + self.skip_unused(); + + // Parse a regular array element + let value = self.parse_json_value()?; + values.push(value); + } + Ok(JsonAst::Array(values)) } /// Parse a JSON array with extended syntax support. @@ -539,12 +891,12 @@ impl<'a> Parser<'a> { /// Extended JSON array syntax support: /// - Empty elements between commas (e.g., [1,,3]) which standard JSON doesn't allow /// - Empty elements at the end of arrays (e.g., [1,2,]) which standard JSON doesn't allow - fn parse_json_array(&mut self) -> Result> { + fn parse_json_array(&mut self) -> Result> { // Ensure the array starts with an opening bracket self.must_is(b'[')?; let mut first = true; - let mut values = Vec::new(); + let mut values = Vec::with_capacity(8); // Parse array elements until closing bracket is found loop { @@ -572,7 +924,7 @@ impl<'a> Parser<'a> { // This is where the parser extends standard JSON by allowing empty elements if self.check_next_either(b',', b']') { // Insert null for empty element - values.push(Value::Null); + values.push(JsonAst::Null); continue; } @@ -580,7 +932,7 @@ impl<'a> Parser<'a> { let value = self.parse_json_value()?; values.push(value); } - Ok(Value::Array(values)) + Ok(JsonAst::Array(values)) } /// Parse a JSON object with key-value pairs. 
@@ -594,12 +946,12 @@ impl<'a> Parser<'a> { /// - Keys must be strings /// - Keys and values are separated by colons /// - Key-value pairs are separated by commas - fn parse_json_object(&mut self) -> Result> { + fn parse_json_object(&mut self) -> Result> { // Ensure the object starts with an opening brace self.must_is(b'{')?; let mut first = true; - let mut obj = Object::new(); + let mut obj = Vec::with_capacity(16); // Parse key-value pairs until closing brace is found loop { @@ -623,9 +975,10 @@ impl<'a> Parser<'a> { // Parse the key (must be a string) let key = self.parse_json_value()?; - if !key.is_string() { + let Some(key_str) = key.as_string() else { return Err(self.error(ParseErrorCode::KeyMustBeAString)); - } + }; + let pos = self.idx; self.skip_unused(); @@ -640,19 +993,27 @@ impl<'a> Parser<'a> { let value = self.parse_json_value()?; // Add the key-value pair to the object - // Note: This converts the key from a borrowed string to an owned string, - // which could be an optimization target for future improvements - let k = key.as_str().unwrap(); - obj.insert(k.to_string(), value); + obj.push((key_str, value, pos)); + } + + // Sort the Object fields by key and check for duplicate keys. + // Returns an error if duplicate keys are found. 
+ obj.sort_by(|a, b| a.0.cmp(&b.0)); + for i in 1..obj.len() { + if obj[i - 1].0 == obj[i].0 { + let key_str = obj[i].0.clone().to_string(); + let pos = obj[i].2; + let code = ParseErrorCode::ObjectDuplicateKey(key_str); + return Err(Error::Syntax(code, pos)); + } } - Ok(Value::Object(obj)) + Ok(JsonAst::Object(obj)) } } #[cfg(test)] mod tests { use super::*; - use ethnum::i256; use proptest::prelude::*; fn string_strategy() -> impl Strategy { @@ -661,26 +1022,65 @@ mod tests { let cjk = '\u{4E00}'..='\u{9FFF}'; let chars: Vec = ascii.chain(cjk).collect(); - prop::collection::vec(prop::sample::select(chars), 1..30) + prop::collection::vec(prop::sample::select(chars), 1..50) .prop_map(|v| v.into_iter().collect()) } - fn json_strategy() -> impl Strategy> { - let leaf = prop_oneof![ - Just(Value::Null), - any::().prop_map(Value::Bool), - any::().prop_map(|v| Value::Number(Number::UInt64(v))), - any::().prop_map(|v| Value::Number(Number::Int64(v))), - any::().prop_filter("Exclude -0.0", |x| *x != -0.0).prop_map(|v| Value::Number(Number::Float64(v))), - (0u8..19u8, any::()).prop_map(|(scale, value)| Value::Number(Number::Decimal64(Decimal64 { scale, value }))), - (0u8..39u8, any::()).prop_map(|(scale, value)| Value::Number(Number::Decimal128(Decimal128 { scale, value }))), - (0u8..77u8, any::(), any::()).prop_filter("Exclude big i256", + fn standard_number_strategy() -> impl Strategy { + prop_oneof![ + any::().prop_map(Number::UInt64), + any::().prop_map(Number::Int64), + any::() + .prop_filter("Exclude -0.0", |x| *x != -0.0) + .prop_map(Number::Float64), + ] + } + + #[cfg(feature = "arbitrary_precision")] + fn number_strategy() -> impl Strategy { + use crate::Decimal128; + use crate::Decimal256; + use crate::Decimal64; + use ethnum::i256; + prop_oneof![ + any::().prop_map(Number::UInt64), + any::().prop_map(Number::Int64), + any::().prop_filter("Exclude -0.0", |x| *x != -0.0).prop_map(Number::Float64), + (0u8..=18u8, any::()).prop_map(|(scale, value)| 
Number::Decimal64(Decimal64 { scale, value })), + (0u8..=38u8, any::()).prop_map(|(scale, value)| Number::Decimal128(Decimal128 { scale, value })), + (0u8..=76u8, any::(), any::()).prop_filter("Exclude big i256", |(_, hi, lo)| { let val = i256::from_words(*hi, *lo); val >= ethnum::int!("-9999999999999999999999999999999999999999999999999999999999999999999999999999") && val <= ethnum::int!("9999999999999999999999999999999999999999999999999999999999999999999999999999") }) - .prop_map(|(scale, hi, lo)| Value::Number(Number::Decimal256(Decimal256 { scale, value: i256::from_words(hi, lo) }))), + .prop_map(|(scale, hi, lo)| Number::Decimal256(Decimal256 { scale, value: i256::from_words(hi, lo) })), + ] + } + + #[cfg(feature = "arbitrary_precision")] + fn json_strategy() -> impl Strategy> { + let leaf = prop_oneof![ + Just(Value::Null), + any::().prop_map(Value::Bool), + number_strategy().prop_map(Value::Number), + string_strategy().prop_map(|v| Value::String(Cow::Owned(v))), + ]; + + leaf.prop_recursive(8, 256, 30, |inner| { + prop_oneof![ + prop::collection::vec(inner.clone(), 0..10).prop_map(Value::Array), + prop::collection::btree_map(string_strategy(), inner, 0..20) + .prop_map(Value::Object), + ] + }) + } + + fn standard_json_strategy() -> impl Strategy> { + let leaf = prop_oneof![ + Just(Value::Null), + any::().prop_map(Value::Bool), + standard_number_strategy().prop_map(Value::Number), string_strategy().prop_map(|v| Value::String(Cow::Owned(v))), ]; @@ -695,18 +1095,38 @@ mod tests { proptest! 
{ #[test] + #[cfg(feature = "arbitrary_precision")] fn test_json_parser(json in json_strategy()) { let source = format!("{}", json); - println!("source={}", source); - let res1 = serde_json::from_slice::(source.as_bytes()); let res2 = parse_value(source.as_bytes()); + let res3 = parse_owned_jsonb(source.as_bytes()); + assert_eq!(res1.is_ok(), res2.is_ok()); + assert_eq!(res1.is_ok(), res3.is_ok()); + if res1.is_ok() { + let res1 = format!("{}", res1.unwrap()); + let res2 = format!("{}", res2.unwrap()); + let res3 = format!("{}", res3.unwrap()); + assert_eq!(res1, res2); + assert_eq!(res1, res3); + } + } + } + + proptest! { + #[test] + fn test_standard_json_parser(json in standard_json_strategy()) { + let source = format!("{}", json); + let res1 = serde_json::from_slice::(source.as_bytes()); + let res2 = parse_value_standard_mode(source.as_bytes()); + let res3 = parse_owned_jsonb_standard_mode(source.as_bytes()); assert_eq!(res1.is_ok(), res2.is_ok()); - if res2.is_ok() { - let new_json = res2.unwrap(); - let result = format!("{}", new_json); - println!("result={}", result); - assert_eq!(source, result); + assert_eq!(res1.is_ok(), res3.is_ok()); + if res1.is_ok() { + let res2 = format!("{}", res2.unwrap()); + let res3 = format!("{}", res3.unwrap()); + assert_eq!(source, res2); + assert_eq!(source, res3); } } } diff --git a/tests/it/encode.rs b/tests/it/encode.rs index 8092c74..aa16386 100644 --- a/tests/it/encode.rs +++ b/tests/it/encode.rs @@ -171,10 +171,6 @@ fn test_encode_array() { b"\x80\0\0\x02\x30\0\0\0\x40\0\0\0", ); - let buf = Value::Array(vec![Value::Bool(false), Value::Bool(true)]).to_vec(); - let raw_jsonb = jsonb::RawJsonb::new(&buf); - println!("{}", raw_jsonb.to_string()); - assert_eq!( &Value::Array(vec![ Value::Bool(false), diff --git a/tests/it/functions.rs b/tests/it/functions.rs index d9d0ddd..d2d4b7c 100644 --- a/tests/it/functions.rs +++ b/tests/it/functions.rs @@ -1055,12 +1055,36 @@ fn test_strip_nulls() { } } +#[test] +#[cfg(feature = 
"arbitrary_precision")] +fn test_decimal_type_of() { + let sources = vec![ + (r#"-1.2"#, "DECIMAL"), + (r#"1.9120000000000001"#, "DECIMAL"), + ( + r#"99999999999999999999999999999999999999999999999999999999.99999999999999999999"#, + "DECIMAL", + ), + ( + r#"-9999999999999999999999999999999999999999999999999999999999999999999999999999"#, + "DECIMAL", + ), + ]; + + for (s, expect) in sources { + let owned_jsonb = s.parse::().unwrap(); + let raw_jsonb = owned_jsonb.as_raw(); + + let res = raw_jsonb.type_of(); + assert_eq!(res, Ok(expect)); + } +} + #[test] fn test_type_of() { let sources = vec![ (r#"null"#, "NULL_VALUE"), (r#"1"#, "INTEGER"), - (r#"-1.2"#, "DECIMAL"), (r#"1.912000000000000e+02"#, "DOUBLE"), (r#""test""#, "STRING"), (r#"[1,2,3,4,5]"#, "ARRAY"), diff --git a/tests/it/jsonpath_parser.rs b/tests/it/jsonpath_parser.rs index 84d3ba8..53eeeca 100644 --- a/tests/it/jsonpath_parser.rs +++ b/tests/it/jsonpath_parser.rs @@ -76,11 +76,11 @@ fn test_json_path() { let json_path = parse_json_path(case.as_bytes()).unwrap(); writeln!(file, "---------- Input ----------").unwrap(); - writeln!(file, "{}", case).unwrap(); + writeln!(file, "{case}").unwrap(); writeln!(file, "---------- Output ---------").unwrap(); - writeln!(file, "{}", json_path).unwrap(); + writeln!(file, "{json_path}").unwrap(); writeln!(file, "---------- AST ------------").unwrap(); - writeln!(file, "{:#?}", json_path).unwrap(); + writeln!(file, "{json_path:#?}").unwrap(); writeln!(file, "\n").unwrap(); } } diff --git a/tests/it/keypath_parser.rs b/tests/it/keypath_parser.rs index 02a0fd1..85bb6e4 100644 --- a/tests/it/keypath_parser.rs +++ b/tests/it/keypath_parser.rs @@ -27,11 +27,11 @@ fn test_json_path() { let key_paths = parse_key_paths(case.as_bytes()).unwrap(); writeln!(file, "---------- Input ----------").unwrap(); - writeln!(file, "{}", case).unwrap(); + writeln!(file, "{case}").unwrap(); writeln!(file, "---------- Output ---------").unwrap(); - writeln!(file, "{}", key_paths).unwrap(); + 
writeln!(file, "{key_paths}").unwrap(); writeln!(file, "---------- AST ------------").unwrap(); - writeln!(file, "{:#?}", key_paths).unwrap(); + writeln!(file, "{key_paths:#?}").unwrap(); writeln!(file, "\n").unwrap(); } } diff --git a/tests/it/number.rs b/tests/it/number.rs index 5ed958c..d93c0ff 100644 --- a/tests/it/number.rs +++ b/tests/it/number.rs @@ -34,19 +34,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -use core::cmp::Ordering::*; - use jsonb::RawJsonb; -use jsonb::{parse_value, Number, Value}; +use jsonb::{parse_value, parse_value_standard_mode}; +use ordered_float::OrderedFloat; #[test] +#[cfg(feature = "arbitrary_precision")] fn it_cmps_decimals() { + use core::cmp::Ordering::*; + use jsonb::{Number, Value}; + fn cmp(a: &str, b: &str, c: core::cmp::Ordering) { let v1 = parse_value(a.as_bytes()).unwrap(); let v2 = parse_value(b.as_bytes()).unwrap(); - let s1 = format!("{}", v1); - let s2 = format!("{}", v2); + let s1 = format!("{v1}"); + let s2 = format!("{v2}"); let buf1 = v1.to_vec(); let buf2 = v2.to_vec(); @@ -2164,16 +2167,29 @@ fn it_cmps_decimals() { #[test] fn test_parse_decimal() { let tests = [ - ("-999999999999999999"), - ("999999999999999999"), - ("-999999999.99999999999999999999999999999"), - ("-99999999999999999999999999999999999999"), - ("9999999999999999999999999.9999999999999"), - ("99999999999999999999999999999999999999"), - ("-999999999999999999999999999999999999999999999999999.9999999999999999999999999"), - ("-9999999999999999999999999999999999999999999999999999999999999999999999999999"), - ("99999999999999999999999999999999999.99999999999999999999999999999999999999999"), - ("9999999999999999999999999999999999999999999999999999999999999999999999999999"), + "10000000000000000000000000000000000000", + "100000000000000000000000000000000000004", + "1.00000000000000000000000000000000000004", + "123.4500000000000000000000000000000000000004", + 
"123.0045000000000000000000000000000000000004", + "123450000000000000000000.0000000000000000004", + "0.001234500000000000000000000000000000000004", + "-123456789012345678", + "123456789012345678", + "-123456789012.34567890123456789012345678", + "12345678901234567890123456.789012345678", + "-12345678901234567890123456789012345678901234567890123456789.01234567890123456", + "1234567890123.456789012345678901234567890123456789012345678901234567890123456", + "-999999999999999999", + "999999999999999999", + "-999999999.99999999999999999999999999999", + "-99999999999999999999999999999999999999", + "9999999999999999999999999.9999999999999", + "99999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999999999.9999999999999999999999999", + "-9999999999999999999999999999999999999999999999999999999999999999999999999999", + "99999999999999999999999999999999999.99999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999999999999999999999999999999999999", "1.123456", "-0.123456", "-1.00", @@ -2220,9 +2236,10 @@ fn test_parse_decimal() { "317000006395220278118691742155288870912", ]; + #[cfg(feature = "arbitrary_precision")] for test in tests { let v = parse_value(test.as_bytes()).unwrap(); - let s = format!("{}", v); + let s = format!("{v}"); let buf = v.to_vec(); let r = RawJsonb::new(&buf); let ss = r.to_string(); @@ -2230,6 +2247,19 @@ fn test_parse_decimal() { assert_eq!(test, s); assert_eq!(test, ss); } + // standard json + for test in tests { + let v = parse_value_standard_mode(test.as_bytes()).unwrap(); + let s = OrderedFloat(v.as_f64().unwrap()); + let buf = v.to_vec(); + let r = RawJsonb::new(&buf); + let ss = OrderedFloat(r.as_f64().unwrap().unwrap()); + let val: f64 = fast_float2::parse(test).unwrap(); + let expected = OrderedFloat(val); + + assert_eq!(expected, s); + assert_eq!(expected, ss); + } } #[test] @@ -2253,7 +2283,19 @@ fn test_parse_float() { for (expected, test) in tests { let v = 
parse_value(test.as_bytes()).unwrap(); - let s = format!("{}", v); + let s = format!("{v}"); + let buf = v.to_vec(); + let r = RawJsonb::new(&buf); + let ss = r.to_string(); + + assert_eq!(expected, s); + assert_eq!(expected, ss); + } + + // standard json + for (expected, test) in tests { + let v = parse_value_standard_mode(test.as_bytes()).unwrap(); + let s = format!("{v}"); let buf = v.to_vec(); let r = RawJsonb::new(&buf); let ss = r.to_string(); diff --git a/tests/it/parser.rs b/tests/it/parser.rs index 3932a2a..9313fc1 100644 --- a/tests/it/parser.rs +++ b/tests/it/parser.rs @@ -14,7 +14,7 @@ use std::borrow::Cow; -use jsonb::{parse_value, Number, Object, Value}; +use jsonb::{parse_value, parse_value_standard_mode, Number, Object, Value}; fn test_parse_err(errors: &[(&str, &'static str)]) { for &(s, err) in errors { @@ -30,6 +30,20 @@ fn test_parse_ok(tests: Vec<(&str, Value<'_>)>) { } } +fn test_parse_standard_err(errors: &[(&str, &'static str)]) { + for &(s, err) in errors { + let res = parse_value_standard_mode(s.as_bytes()); + assert!(res.is_err()); + assert_eq!(res.err().unwrap().to_string(), err); + } +} + +fn test_parse_standard_ok(tests: Vec<(&str, Value<'_>)>) { + for (s, val) in tests { + assert_eq!(parse_value_standard_mode(s.as_bytes()).unwrap(), val); + } +} + #[test] fn test_parse_null() { test_parse_err(&[ @@ -62,9 +76,9 @@ fn test_parse_boolean() { #[test] fn test_parse_number_errors() { test_parse_err(&[ - ("+", "expected value, pos 1"), - (".", "expected value, pos 1"), - ("-", "expected value, pos 1"), + ("+", "invalid number, pos 1"), + (".", "invalid number, pos 1"), + ("-", "invalid number, pos 1"), ("0x80", "trailing characters, pos 2"), ("\\0", "expected value, pos 1"), ("1.a", "trailing characters, pos 3"), @@ -72,41 +86,62 @@ fn test_parse_number_errors() { ("1e+", "invalid number, pos 3"), ("1a", "trailing characters, pos 2"), ]); + + test_parse_standard_err(&[ + ("+", "invalid number, pos 1"), + (".", "invalid number, pos 1"), + 
("-", "invalid number, pos 1"), + ("0x80", "trailing characters, pos 2"), + ("\\0", "expected value, pos 1"), + ("1.a", "invalid number, pos 3"), + ("1e", "invalid number, pos 2"), + ("1e+", "invalid number, pos 3"), + ("1a", "trailing characters, pos 2"), + // Extended JSON number syntax return error in standard mode + ("+1", "invalid number, pos 1"), + ("00", "invalid number, pos 2"), + (".0", "invalid number, pos 1"), + ("0.", "invalid number, pos 3"), + ("1.", "invalid number, pos 3"), + ("1.e1", "invalid number, pos 3"), + ]); } #[test] fn test_parse_i64() { - test_parse_ok(vec![ + let i64_min = i64::MIN.to_string(); + let i64_max = i64::MAX.to_string(); + let tests = vec![ ("-2", Value::Number(Number::Int64(-2))), ("-1234", Value::Number(Number::Int64(-1234))), (" -1234 ", Value::Number(Number::Int64(-1234))), - ( - &i64::MIN.to_string(), - Value::Number(Number::Int64(i64::MIN)), - ), - ( - &i64::MAX.to_string(), - Value::Number(Number::UInt64(i64::MAX as u64)), - ), - ]); + (&i64_min, Value::Number(Number::Int64(i64::MIN))), + (&i64_max, Value::Number(Number::UInt64(i64::MAX as u64))), + ]; + test_parse_ok(tests.clone()); + test_parse_standard_ok(tests); } #[test] fn test_parse_u64() { - test_parse_ok(vec![ + let u64_max = u64::MAX.to_string(); + let tests = vec![ ("0", Value::Number(Number::UInt64(0u64))), ("3", Value::Number(Number::UInt64(3u64))), ("1234", Value::Number(Number::UInt64(1234))), - ( - &u64::MAX.to_string(), - Value::Number(Number::UInt64(u64::MAX)), - ), - ]); + (&u64_max, Value::Number(Number::UInt64(u64::MAX))), + ]; + test_parse_ok(tests.clone()); + test_parse_standard_ok(tests); } #[test] fn test_parse_f64() { - test_parse_ok(vec![ + let i64_min_minus_one = format!("{}", (i64::MIN as f64) - 1.0); + let u64_max_plus_one = format!("{}", (u64::MAX as f64) + 1.0); + let epsilon = format!("{}", f64::EPSILON); + + let tests = vec![ ( "100e777777777777777777777777777", Value::Number(Number::Float64(f64::INFINITY)), @@ -170,17 +205,14 @@ fn 
test_parse_f64() { Value::Number(Number::Float64(0.01)), ), ( - &format!("{}", (i64::MIN as f64) - 1.0), + &i64_min_minus_one, Value::Number(Number::Float64((i64::MIN as f64) - 1.0)), ), ( - &format!("{}", (u64::MAX as f64) + 1.0), + &u64_max_plus_one, Value::Number(Number::Float64((u64::MAX as f64) + 1.0)), ), - ( - &format!("{}", f64::EPSILON), - Value::Number(Number::Float64(f64::EPSILON)), - ), + (&epsilon, Value::Number(Number::Float64(f64::EPSILON))), ( "0.0000000000000000000000000000000000000000000000000123e50", Value::Number(Number::Float64(1.23)), @@ -237,14 +269,20 @@ fn test_parse_f64() { 000000000000000000e-10", Value::Number(Number::Float64(1e308)), ), - // Extended JSON number syntax + ]; + test_parse_ok(tests.clone()); + test_parse_standard_ok(tests); + + // Extended JSON number syntax + let extended_tests = vec![ ("+1", Value::Number(Number::Int64(1))), ("00", Value::Number(Number::UInt64(0))), (".0", Value::Number(Number::UInt64(0))), ("0.", Value::Number(Number::UInt64(0))), ("1.", Value::Number(Number::UInt64(1))), ("1.e1", Value::Number(Number::Float64(10.0))), - ]); + ]; + test_parse_ok(extended_tests); } #[test] @@ -319,7 +357,20 @@ fn test_parse_array() { ("[]a", "trailing characters, pos 3"), ]); - test_parse_ok(vec![ + test_parse_standard_err(&[ + ("[", "EOF while parsing a value, pos 1"), + ("[ ", "EOF while parsing a value, pos 2"), + ("[1", "EOF while parsing a value, pos 2"), + ("[1,", "EOF while parsing a value, pos 3"), + ("[1 2]", "expected `,` or `]`, pos 3"), + ("[]a", "trailing characters, pos 3"), + // Extended JSON array syntax return error in standard mode + ("[1, ]", "expected value, pos 5"), + ("[ , 2, 3]", "expected value, pos 3"), + ("[ , ]", "expected value, pos 3"), + ]); + + let tests = vec![ ("[]", Value::Array(vec![])), ("[ ]", Value::Array(vec![])), ("[null]", Value::Array(vec![Value::Null])), @@ -368,7 +419,13 @@ fn test_parse_array() { ]), ]), ), - // Extended JSON array syntax + ]; + + test_parse_ok(tests.clone()); 
+ test_parse_standard_ok(tests); + + // Extended JSON array syntax + let extended_tests = vec![ ( "[1, ]", Value::Array(vec![Value::Number(Number::UInt64(1)), Value::Null]), @@ -382,7 +439,8 @@ fn test_parse_array() { ]), ), ("[ , ]", Value::Array(vec![Value::Null, Value::Null])), - ]); + ]; + test_parse_ok(extended_tests); } #[test] @@ -400,6 +458,10 @@ fn test_parse_object() { ("{\"a\":1 1", "expected `,` or `}`, pos 7"), ("{\"a\":1,", "EOF while parsing a value, pos 7"), ("{}a", "trailing characters, pos 3"), + ( + "{\"k\":\"v\",\"k\":\"v2\"}", + "duplicate object attribute \"k\", pos 12", + ), ]); let mut obj1 = Object::new(); From 61547341c3081f7eebde202004bb3d5c491bc918 Mon Sep 17 00:00:00 2001 From: baishen Date: Fri, 1 Aug 2025 08:59:46 +0800 Subject: [PATCH 2/3] number cast --- src/number.rs | 60 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/src/number.rs b/src/number.rs index ded0170..e62e833 100644 --- a/src/number.rs +++ b/src/number.rs @@ -359,16 +359,40 @@ impl Number { match self { Number::Int64(v) => Some(*v), Number::UInt64(v) => { - if *v <= i64::MAX.try_into().unwrap() { + if *v <= i64::MAX as u64 { Some(*v as i64) } else { None } } - Number::Float64(_) - | Number::Decimal64(_) - | Number::Decimal128(_) - | Number::Decimal256(_) => None, + Number::Float64(_) => None, + Number::Decimal64(v) => { + if v.scale == 0 { + Some(v.value) + } else { + None + } + } + Number::Decimal128(v) => { + if v.scale == 0 + && v.value >= i128::from(i64::MIN) + && v.value <= i128::from(i64::MAX) + { + Some(v.value as i64) + } else { + None + } + } + Number::Decimal256(v) => { + if v.scale == 0 + && v.value >= i256::from(i64::MIN) + && v.value <= i256::from(i64::MAX) + { + Some(v.value.as_i64()) + } else { + None + } + } } } @@ -385,10 +409,28 @@ impl Number { } } Number::UInt64(v) => Some(*v), - Number::Float64(_) - | Number::Decimal64(_) - | Number::Decimal128(_) - | Number::Decimal256(_) => None, + 
Number::Float64(_) => None, + Number::Decimal64(v) => { + if v.scale == 0 && v.value >= 0 { + Some(v.value as u64) + } else { + None + } + } + Number::Decimal128(v) => { + if v.scale == 0 && v.value >= 0 && v.value <= i128::from(u64::MAX) { + Some(v.value as u64) + } else { + None + } + } + Number::Decimal256(v) => { + if v.scale == 0 && v.value >= i256::ZERO && v.value <= i256::from(u64::MAX) { + Some(v.value.as_u64()) + } else { + None + } + } } } From 8abb035cc60e9f2653645d67a5151e4de884660e Mon Sep 17 00:00:00 2001 From: baishen Date: Sat, 2 Aug 2025 10:50:45 +0800 Subject: [PATCH 3/3] fix --- src/parser.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index b6bc9f9..9259a51 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -158,12 +158,8 @@ impl<'a> JsonAst<'a> { } JsonAst::Object(kvs) => { let mut object = Object::new(); - for (key, val, pos) in kvs.into_iter() { + for (key, val, _) in kvs.into_iter() { let key_str = key.to_string(); - if object.contains_key(&key_str) { - let code = ParseErrorCode::ObjectDuplicateKey(key_str); - return Err(Error::Syntax(code, pos)); - } let value = val.into_value()?; object.insert(key_str, value); }