Skip to content

Commit a30d0d5

Browse files
authored
feat(Rust): Implementing Type Compatible (apache#2492)
## What does this PR do? - Updates the Rust implementation to follow the latest `xlang` specification. (Implemented, but not yet tested against `Java`.) - Improves the implementation of compatible mode: the `field_name` and `field_type` of the sender and the receiver are compared to determine whether each field is assigned or skipped. ## TODO - Still need to figure out how to serialize and deserialize `Box`. - Handle built-in unsigned types. (I will add type IDs such as `125 => u8` to implement serialization/deserialization for Rust-to-Rust communication.) - Add special handling of `Option` field types when extracting the generic structure of field types in macros. - During deserialization, for fields that do not exist locally and need to be skipped, automatically create a deserializer for the corresponding type and consume its byte data. ## Related issues - apache#2145 ## Does this PR introduce any user-facing change? <!-- If any user-facing interface changes, please [open an issue](https://github.com/apache/fory/issues/new/choose) describing the need to do so and update the documentation if necessary. --> - [ ] Does this PR introduce any public API change? - [x] Does this PR introduce any binary protocol compatibility change? ## Benchmark <!-- When the PR has an impact on performance (if you don't know whether the PR will have an impact on performance, you can submit the PR first, and if it does have an impact, the code reviewer will explain it), be sure to attach benchmark data here. -->
1 parent 0ae438d commit a30d0d5

27 files changed

Lines changed: 992 additions & 250 deletions

rust/fory-core/src/buffer.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -93,24 +93,24 @@ impl Writer {
9393
} else if value >> 14 == 0 {
9494
let u1 = (value & 0x7F) | 0x80;
9595
let u2 = value >> 7;
96-
self.u16(((u1 << 8) | u2) as u16);
96+
self.u16(((u2 << 8) | u1) as u16);
9797
} else if value >> 21 == 0 {
9898
let u1 = (value & 0x7F) | 0x80;
99-
let u2 = (value >> 7) | 0x80;
100-
self.u16(((u1 << 8) | u2) as u16);
99+
let u2 = ((value >> 7) & 0x7F) | 0x80;
100+
self.u16(((u2 << 8) | u1) as u16);
101101
self.u8((value >> 14) as u8);
102102
} else if value >> 28 == 0 {
103103
let u1 = (value & 0x7F) | 0x80;
104-
let u2 = (value >> 7) | 0x80;
105-
let u3 = (value >> 14) | 0x80;
106-
let u4 = (value >> 21) | 0x80;
107-
self.u32(((u1 << 24) | (u2 << 16) | (u3 << 8) | u4) as u32);
104+
let u2 = ((value >> 7) & 0x7F) | 0x80;
105+
let u3 = ((value >> 14) & 0x7F) | 0x80;
106+
let u4 = value >> 21;
107+
self.u32(((u4 << 24) | (u3 << 16) | (u2 << 8) | u1) as u32);
108108
} else {
109109
let u1 = (value & 0x7F) | 0x80;
110-
let u2 = (value >> 7) | 0x80;
111-
let u3 = (value >> 14) | 0x80;
112-
let u4 = (value >> 21) | 0x80;
113-
self.u32(((u1 << 24) | (u2 << 16) | (u3 << 8) | u4) as u32);
110+
let u2 = ((value >> 7) & 0x7F) | 0x80;
111+
let u3 = ((value >> 14) & 0x7F) | 0x80;
112+
let u4 = ((value >> 21) & 0x7F) | 0x80;
113+
self.u32(((u4 << 24) | (u3 << 16) | (u2 << 8) | u1) as u32);
114114
self.u8((value >> 28) as u8);
115115
}
116116
}

rust/fory-core/src/fory.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,22 @@
1717

1818
use crate::buffer::{Reader, Writer};
1919
use crate::error::Error;
20-
use crate::resolver::class_resolver::{ClassInfo, ClassResolver};
2120
use crate::resolver::context::ReadContext;
2221
use crate::resolver::context::WriteContext;
22+
use crate::resolver::type_resolver::{TypeInfo, TypeResolver};
2323
use crate::serializer::{Serializer, StructSerializer};
2424
use crate::types::{config_flags, Language, Mode, SIZE_OF_REF_AND_TYPE};
2525

2626
pub struct Fory {
2727
mode: Mode,
28-
class_resolver: ClassResolver,
28+
type_resolver: TypeResolver,
2929
}
3030

3131
impl Default for Fory {
3232
fn default() -> Self {
3333
Fory {
3434
mode: Mode::SchemaConsistent,
35-
class_resolver: ClassResolver::default(),
35+
type_resolver: TypeResolver::default(),
3636
}
3737
}
3838
}
@@ -78,20 +78,20 @@ impl Fory {
7878
pub fn serialize<T: Serializer>(&self, record: &T) -> Vec<u8> {
7979
let mut writer = Writer::default();
8080
let meta_offset = self.write_head::<T>(&mut writer);
81-
let mut context = WriteContext::new(self, &mut writer);
81+
let mut context: WriteContext<'_> = WriteContext::new(self, &mut writer);
8282
<T as Serializer>::serialize(record, &mut context);
8383
if Mode::Compatible == self.mode {
8484
context.write_meta(meta_offset);
8585
}
8686
writer.dump()
8787
}
8888

89-
pub fn get_class_resolver(&self) -> &ClassResolver {
90-
&self.class_resolver
89+
pub fn get_type_resolver(&self) -> &TypeResolver {
90+
&self.type_resolver
9191
}
9292

9393
pub fn register<T: 'static + StructSerializer>(&mut self, id: u32) {
94-
let class_info = ClassInfo::new::<T>(self, id);
95-
self.class_resolver.register::<T>(class_info, id);
94+
let type_info = TypeInfo::new::<T>(self, id);
95+
self.type_resolver.register::<T>(type_info, id);
9696
}
9797
}

rust/fory-core/src/meta/meta_string.rs

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,9 @@ impl Default for MetaStringDecoder {
6565
}
6666
}
6767

68-
pub struct MetaStringEncoder {}
69-
impl Default for MetaStringEncoder {
70-
fn default() -> Self {
71-
Self::new()
72-
}
68+
#[derive(Default)]
69+
pub struct MetaStringEncoder<'a> {
70+
encoding_options: Option<&'a [Encoding]>,
7371
}
7472

7573
#[derive(Debug)]
@@ -80,9 +78,13 @@ struct StringStatistics {
8078
can_lower_special_encoded: bool,
8179
}
8280

83-
impl MetaStringEncoder {
81+
impl<'a> MetaStringEncoder<'a> {
8482
pub fn new() -> Self {
85-
MetaStringEncoder {}
83+
Self::default()
84+
}
85+
pub fn set_options(mut self, encoding_options: Option<&'a [Encoding]>) -> Self {
86+
self.encoding_options = encoding_options;
87+
self
8688
}
8789

8890
fn is_latin(&self, s: &str) -> bool {
@@ -108,22 +110,30 @@ impl MetaStringEncoder {
108110
}
109111

110112
fn compute_encoding(&self, input: &str) -> Encoding {
113+
let allow = |e: Encoding| self.encoding_options.map_or(true, |opts| opts.contains(&e));
111114
let statistics = self.compute_statistics(input);
112-
if statistics.can_lower_special_encoded {
115+
if statistics.can_lower_special_encoded && allow(Encoding::LowerSpecial) {
113116
return Encoding::LowerSpecial;
114117
}
115118
if statistics.can_lower_upper_digit_special_encoded {
116-
if statistics.digit_count != 0 {
119+
if statistics.digit_count != 0 && allow(Encoding::LowerUpperDigitSpecial) {
117120
return Encoding::LowerUpperDigitSpecial;
118121
}
119122
let upper_count: usize = statistics.upper_count;
120-
if upper_count == 1 && input.chars().next().unwrap().is_uppercase() {
123+
if upper_count == 1
124+
&& input.chars().next().unwrap().is_uppercase()
125+
&& allow(Encoding::FirstToLowerSpecial)
126+
{
121127
return Encoding::FirstToLowerSpecial;
122128
}
123-
if ((input.len() + upper_count) * 5) < (input.len() * 6) {
129+
if ((input.len() + upper_count) * 5) < (input.len() * 6)
130+
&& allow(Encoding::AllToLowerSpecial)
131+
{
124132
return Encoding::AllToLowerSpecial;
125133
}
126-
return Encoding::LowerUpperDigitSpecial;
134+
if allow(Encoding::LowerUpperDigitSpecial) {
135+
return Encoding::LowerUpperDigitSpecial;
136+
}
127137
}
128138
Encoding::Utf8
129139
}

rust/fory-core/src/meta/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@ mod string_util;
2020
mod type_meta;
2121

2222
pub use meta_string::{Encoding, MetaString, MetaStringDecoder, MetaStringEncoder};
23-
pub use type_meta::{FieldInfo, TypeMeta};
23+
pub use string_util::murmurhash3_x64_128;
24+
pub use type_meta::{FieldInfo, FieldType, TypeMeta, TypeMetaLayer};

rust/fory-core/src/meta/string_util.rs

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use std::mem;
19+
1820
#[cfg(target_feature = "neon")]
1921
use std::arch::aarch64::*;
2022

@@ -187,3 +189,198 @@ mod tests {
187189
assert!(!is_latin_standard(&not_latin_str));
188190
}
189191
}
192+
193+
/// Final avalanche step of MurmurHash3 for one 64-bit lane.
///
/// Alternates xor-shifts by 33 bits with wrapping multiplications by two fixed odd constants.
fn fmix64(mut k: u64) -> u64 {
    k ^= k >> 33;
    k = k.wrapping_mul(0xff51afd7ed558ccdu64);
    k ^= k >> 33;
    k = k.wrapping_mul(0xc4ceb9fe1a85ec53u64);
    k ^= k >> 33;

    k
}

/// Computes the 128-bit MurmurHash3 (x64 variant) of `bytes`.
///
/// `seed` initialises both 64-bit halves of the state. The result is returned as `(h1, h2)`.
///
/// Input words are always decoded as little-endian, for both the 16-byte blocks and the
/// trailing partial block, so the hash does not depend on the alignment of `bytes` or on the
/// endianness of the host.
pub fn murmurhash3_x64_128(bytes: &[u8], seed: u64) -> (u64, u64) {
    const C1: u64 = 0x87c37b91114253d5;
    const C2: u64 = 0x4cf5ad432745937f;
    /// Width in bytes of one 64-bit lane.
    const LANE: usize = mem::size_of::<u64>();
    /// Width in bytes of one full block (two lanes).
    const BLOCK: usize = 2 * LANE;

    /// Decodes up to `LANE` bytes as a little-endian `u64`, zero-padding the high bytes.
    fn load_le(chunk: &[u8]) -> u64 {
        let mut lane = [0u8; LANE];
        lane[..chunk.len()].copy_from_slice(chunk);
        u64::from_le_bytes(lane)
    }

    /// Pre-mixes a word destined for the first half of the state.
    fn scramble_k1(k1: u64) -> u64 {
        k1.wrapping_mul(C1).rotate_left(31).wrapping_mul(C2)
    }

    /// Pre-mixes a word destined for the second half of the state.
    fn scramble_k2(k2: u64) -> u64 {
        k2.wrapping_mul(C2).rotate_left(33).wrapping_mul(C1)
    }

    let (mut h1, mut h2) = (seed, seed);

    // Body: consume the input one 16-byte block at a time without any unsafe reinterpretation
    // of the byte slice.
    let mut blocks = bytes.chunks_exact(BLOCK);
    for block in blocks.by_ref() {
        let (lo, hi) = block.split_at(LANE);

        h1 ^= scramble_k1(load_le(lo));
        h1 = h1
            .rotate_left(27)
            .wrapping_add(h2)
            .wrapping_mul(5)
            .wrapping_add(0x52dce729);

        h2 ^= scramble_k2(load_le(hi));
        h2 = h2
            .rotate_left(31)
            .wrapping_add(h1)
            .wrapping_mul(5)
            .wrapping_add(0x38495ab5);
    }

    // Tail: the remaining 0..=15 bytes. Bytes 8.. feed the second half, bytes ..8 the first.
    let tail = blocks.remainder();
    if tail.len() > LANE {
        h2 ^= scramble_k2(load_le(&tail[LANE..]));
    }
    if !tail.is_empty() {
        h1 ^= scramble_k1(load_le(&tail[..tail.len().min(LANE)]));
    }

    // Finalization: fold in the input length, then avalanche both halves.
    let len = bytes.len() as u64;
    h1 ^= len;
    h2 ^= len;

    h1 = h1.wrapping_add(h2);
    h2 = h2.wrapping_add(h1);

    h1 = fmix64(h1);
    h2 = fmix64(h2);

    h1 = h1.wrapping_add(h2);
    h2 = h2.wrapping_add(h1);

    (h1, h2)
}
307+
308+
#[cfg(test)]
mod test_hash {
    use super::murmurhash3_x64_128;

    /// Hashing no bytes with a zero seed leaves both halves of the state at zero.
    #[test]
    fn test_empty_string() {
        assert_eq!(murmurhash3_x64_128("".as_bytes(), 0), (0, 0));
    }

    /// Covers every input length from 1 to 16 bytes: each possible tail length plus one
    /// input that is exactly one full block.
    #[test]
    fn test_tail_lengths() {
        let cases: [(&str, (u64, u64)); 16] = [
            ("1", (8213365047359667313, 10676604921780958775)),
            ("12", (5355690773644049813, 9855895140584599837)),
            ("123", (10978418110857903978, 4791445053355511657)),
            ("1234", (619023178690193332, 3755592904005385637)),
            ("12345", (2375712675693977547, 17382870096830835188)),
            ("123456", (16435832985690558678, 5882968373513761278)),
            ("1234567", (3232113351312417698, 4025181827808483669)),
            ("12345678", (4272337174398058908, 10464973996478965079)),
            ("123456789", (4360720697772133540, 11094893415607738629)),
            ("123456789a", (12594836289594257748, 2662019112679848245)),
            ("123456789ab", (6978636991469537545, 12243090730442643750)),
            ("123456789abc", (211890993682310078, 16480638721813329343)),
            ("123456789abcd", (12459781455342427559, 3193214493011213179)),
            ("123456789abcde", (12538342858731408721, 9820739847336455216)),
            ("123456789abcdef", (9165946068217512774, 2451472574052603025)),
            ("123456789abcdef1", (9259082041050667785, 12459473952842597282)),
        ];

        for &(input, expected) in cases.iter() {
            // assert_eq! prints both tuples on failure; the message names the failing input.
            assert_eq!(
                murmurhash3_x64_128(input.as_bytes(), 0),
                expected,
                "unexpected hash for input {:?}",
                input
            );
        }
    }

    /// Exercises the multi-block path together with a non-empty tail.
    #[test]
    fn test_large_data() {
        assert_eq!(
            murmurhash3_x64_128("Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam at consequat massa. Cras eleifend pellentesque ex, at dignissim libero maximus ut. Sed eget nulla felis".as_bytes(), 0),
            (9455322759164802692, 17863277201603478371)
        );
    }
}

0 commit comments

Comments
 (0)