Skip to content

Commit 98efd72

Browse files
authored
feat(java/python): new xlang type system spec implementation (#1690)
## What does this PR do? This PR implements a new [type system](https://fury.apache.org/docs/specification/fury_xlang_serialization_spec/#type-systems) for xlang serialization between java and python. The changes include: - Refine type system spec: added new types: - named_enum: an enum whose value will be serialized as the registered name. - struct: a morphic(final) type serialized by Fury Struct serializer. - polymorphic_struct: a type which is not morphic(not final), i.e. it has subclasses. Suppose we're deserializing `List<SomeClass>`, we can save dynamic serializer dispatch only when `SomeClass` is morphic(final), so a polymorphic type must be dispatched dynamically. - compatible_struct: a morphic(final) type serialized by Fury compatible Struct serializer. - polymorphic_compatible_struct: a non-morphic(non-final) type serialized by Fury compatible Struct serializer. - named_struct: a `struct` whose type mapping will be encoded as a name. - named_polymorphic_struct: a `polymorphic_struct` whose type mapping will be encoded as a name. - named_compatible_struct: a `compatible_struct` whose type mapping will be encoded as a name. - named_polymorphic_compatible_struct: a `polymorphic_compatible_struct` whose type mapping will be encoded as a name. - ext: a type which will be serialized by a customized serializer. - polymorphic_ext: an `ext` type which is not morphic(not final). - named_ext: an `ext` type whose type mapping will be encoded as a name. - named_polymorphic_ext: a `polymorphic_ext` type whose type mapping will be encoded as a name. - Added a new XtypeResolver in java to resolve xlang types - Support registering class mappings by id. Before this PR, we only supported registering classes by name, which is more expensive in both space and performance. - Support passing type info to resolve type ambiguity such as `ArrayList/Object[]` in java. 
Users can `serialize(List.of(1, 2, 3))` and deserialize it into an array by `deserialize(bytes, Integer[].class)` - Refactor pyfury serialization by moving the type resolver from cython into python code; this makes debugging easier, reduces code duplication, and also speeds up serialization. - golang xtype serialization tests are disabled; they will be re-enabled after the new type system is implemented in golang ## Related issues <!-- Is there any related issue? Please attach here. - #xxxx0 - #xxxx1 - #xxxx2 --> ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark <!-- When the PR has an impact on performance (if you don't know whether the PR will have an impact on performance, you can submit the PR first, and if it will have impact on performance, the code reviewer will explain it), be sure to attach a benchmark data here. -->
1 parent 8d2d124 commit 98efd72

82 files changed

Lines changed: 4105 additions & 3689 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ pyx_library(
6262
),
6363
deps = [
6464
"//cpp/fury/util:fury_util",
65+
"//cpp/fury/type:fury_type",
6566
"@com_google_absl//absl/container:flat_hash_map",
6667
],
6768
)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ class SomeClass:
278278
f3: Dict[str, str]
279279

280280
fury = pyfury.Fury(ref_tracking=True)
281-
fury.register_class(SomeClass, type_tag="example.SomeClass")
281+
fury.register_type(SomeClass, typename="example.SomeClass")
282282
obj = SomeClass()
283283
obj.f2 = {"k1": "v1", "k2": "v2"}
284284
obj.f1, obj.f3 = obj, obj.f2

cpp/fury/type/BUILD

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
2+
3+
# C++ library target `fury_type`: packages the sources and headers found in
# this directory and exposes them to every other package.
cc_library(
4+
name = "fury_type",
5+
# Every .cc file in this package except those matching *test.cc.
# NOTE(review): if the package only contains headers this glob is empty;
# confirm whether srcs/alwayslink are still needed.
srcs = glob(["*.cc"], exclude=["*test.cc"]),
6+
hdrs = glob(["*.h"]),
7+
# NOTE(review): -mavx2 is an x86-specific flag; confirm this target is never
# built for other architectures (e.g. arm64) before relying on it.
copts = ["-mavx2"], # Enable AVX2 support
8+
linkopts = ["-mavx2"], # Ensure linker also knows about AVX2
9+
# Drops the leading "/cpp" path component from the exported header paths.
strip_include_prefix = "/cpp",
10+
alwayslink=True,
11+
linkstatic=True,
12+
deps = [
13+
],
14+
visibility = ["//visibility:public"],
15+
)

cpp/fury/type/type.h

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include <cstdint> // For fixed-width integer types
21+
22+
namespace fury {
/// Numeric type ids of the Fury cross-language (xlang) type system.
///
/// The underlying type is int32_t and every enumerator has an explicit value,
/// so reordering the declarations does not change any id.
enum class TypeId : int32_t {
  // a boolean value (true or false).
  BOOL = 1,
  // an 8-bit signed integer.
  INT8 = 2,
  // a 16-bit signed integer.
  INT16 = 3,
  // a 32-bit signed integer.
  INT32 = 4,
  // a 32-bit signed integer which uses fury var_int32 encoding.
  VAR_INT32 = 5,
  // a 64-bit signed integer.
  INT64 = 6,
  // a 64-bit signed integer which uses fury PVL encoding.
  VAR_INT64 = 7,
  // a 64-bit signed integer which uses fury SLI encoding.
  SLI_INT64 = 8,
  // a 16-bit floating point number.
  FLOAT16 = 9,
  // a 32-bit floating point number.
  FLOAT32 = 10,
  // a 64-bit floating point number including NaN and Infinity.
  FLOAT64 = 11,
  // a text string encoded using Latin1/UTF16/UTF-8 encoding.
  STRING = 12,
  // a data type consisting of a set of named values. Rust enums with
  // non-predefined field values are not supported as an enum.
  ENUM = 13,
  // an enum whose value will be serialized as the registered name.
  NAMED_ENUM = 14,
  // a morphic(final) type serialized by Fury Struct serializer, i.e. it
  // doesn't have subclasses. Suppose we're deserializing `List<SomeClass>`,
  // we can save dynamic serializer dispatch since `SomeClass` is
  // morphic(final).
  STRUCT = 15,
  // a type which is not morphic(not final), i.e. it has subclasses. Suppose
  // we're deserializing `List<SomeClass>`, we must dispatch serializer
  // dynamically since `SomeClass` is polymorphic(non-final).
  POLYMORPHIC_STRUCT = 16,
  // a morphic(final) type serialized by Fury compatible Struct serializer.
  COMPATIBLE_STRUCT = 17,
  // a non-morphic(non-final) type serialized by Fury compatible Struct
  // serializer.
  POLYMORPHIC_COMPATIBLE_STRUCT = 18,
  // a `struct` whose type mapping will be encoded as a name.
  NAMED_STRUCT = 19,
  // a `polymorphic_struct` whose type mapping will be encoded as a name.
  NAMED_POLYMORPHIC_STRUCT = 20,
  // a `compatible_struct` whose type mapping will be encoded as a name.
  NAMED_COMPATIBLE_STRUCT = 21,
  // a `polymorphic_compatible_struct` whose type mapping will be encoded as a
  // name.
  NAMED_POLYMORPHIC_COMPATIBLE_STRUCT = 22,
  // a type which will be serialized by a customized serializer.
  EXT = 23,
  // an `ext` type which is not morphic(not final).
  POLYMORPHIC_EXT = 24,
  // an `ext` type whose type mapping will be encoded as a name.
  NAMED_EXT = 25,
  // a `polymorphic_ext` type whose type mapping will be encoded as a name.
  NAMED_POLYMORPHIC_EXT = 26,
  // a sequence of objects.
  LIST = 27,
  // an unordered set of unique elements.
  SET = 28,
  // a map of key-value pairs. Mutable types such as
  // `list/map/set/array/tensor/arrow` are not allowed as key of map.
  MAP = 29,
  // an absolute length of time, independent of any calendar/timezone, as a
  // count of nanoseconds.
  DURATION = 30,
  // a point in time, independent of any calendar/timezone, as a count of
  // nanoseconds. The count is relative to an epoch at UTC midnight on
  // January 1, 1970.
  TIMESTAMP = 31,
  // a naive date without timezone. The count is days relative to an epoch at
  // UTC midnight on Jan 1, 1970.
  LOCAL_DATE = 32,
  // exact decimal value represented as an integer value in two's complement.
  DECIMAL = 33,
  // a variable-length array of bytes.
  BINARY = 34,
  // a multidimensional array in which every sub-array can have a different
  // size but all have the same type. Only numeric components are allowed;
  // other arrays will be taken as List. The implementation should support
  // the interoperability between array and list.
  ARRAY = 35,
  // one dimensional bool array.
  BOOL_ARRAY = 36,
  // one dimensional int8 array.
  INT8_ARRAY = 37,
  // one dimensional int16 array.
  INT16_ARRAY = 38,
  // one dimensional int32 array.
  INT32_ARRAY = 39,
  // one dimensional int64 array.
  INT64_ARRAY = 40,
  // one dimensional half_float_16 array.
  FLOAT16_ARRAY = 41,
  // one dimensional float32 array.
  FLOAT32_ARRAY = 42,
  // one dimensional float64 array.
  FLOAT64_ARRAY = 43,
  // an arrow [record
  // batch](https://arrow.apache.org/docs/cpp/tables.html#record-batches)
  // object.
  ARROW_RECORD_BATCH = 44,
  // an arrow [table](https://arrow.apache.org/docs/cpp/tables.html#tables)
  // object.
  ARROW_TABLE = 45,
  // sentinel larger than every type id declared above.
  // NOTE(review): presumably an exclusive upper bound for id-indexed tables;
  // confirm against the callers.
  BOUND = 64
};

/// Tells whether `type_id` is one of the NAMED_* type ids, i.e. a type whose
/// type mapping (or, for NAMED_ENUM, value) is encoded as a registered name.
///
/// @param type_id raw integer id, interpreted as a TypeId.
/// @return true for NAMED_ENUM, NAMED_STRUCT, NAMED_POLYMORPHIC_STRUCT,
///         NAMED_COMPATIBLE_STRUCT, NAMED_POLYMORPHIC_COMPATIBLE_STRUCT,
///         NAMED_EXT and NAMED_POLYMORPHIC_EXT; false for every other value,
///         including integers that match no enumerator.
inline bool IsNamespacedType(int32_t type_id) {
  switch (static_cast<TypeId>(type_id)) {
  case TypeId::NAMED_ENUM:
  case TypeId::NAMED_STRUCT:
  case TypeId::NAMED_POLYMORPHIC_STRUCT:
  case TypeId::NAMED_COMPATIBLE_STRUCT:
  case TypeId::NAMED_POLYMORPHIC_COMPATIBLE_STRUCT:
  case TypeId::NAMED_EXT:
  case TypeId::NAMED_POLYMORPHIC_EXT:
    return true;
  default:
    return false;
  }
}

} // namespace fury

cpp/fury/util/buffer.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
#include "fury/util/bit_util.h"
2929
#include "fury/util/logging.h"
30+
#include "fury/util/status.h"
3031

3132
namespace fury {
3233

@@ -133,6 +134,29 @@ class Buffer {
133134

134135
inline double GetDouble(uint32_t offset) { return Get<double>(offset); }
135136

137+
/// Reads `length` bytes starting at `offset` and stores them in `*target` as
/// the low-order bytes of a 64-bit integer; all higher-order bytes are zero.
///
/// @param offset byte offset of the first byte to read.
/// @param length number of bytes to read, 0..8. A length of 0 yields 0.
/// @param target receives the assembled value; must not be null.
/// @return Status::OK() on success, Status::OutOfBound when `length` exceeds
///         8 or when [offset, offset + length) does not lie inside the
///         buffer. `*target` is left untouched on failure.
///
/// NOTE(review): the byte-wise path assembles the value in little-endian
/// order; the fast path agrees with it only if GetInt64 reads little-endian
/// data — confirm for big-endian targets.
inline Status GetBytesAsInt64(uint32_t offset, uint32_t length,
                              int64_t *target) {
  if (length == 0) {
    *target = 0;
    return Status::OK();
  }
  // An int64_t holds at most 8 bytes. Rejecting larger lengths also keeps the
  // mask shift below inside [0, 56]; otherwise `8 - length` would wrap around
  // and the shift count would be out of range (undefined behavior).
  if (length > 8) {
    return Status::OutOfBound("length exceeds 8 bytes");
  }
  // Compare widened sums instead of testing the sign of a difference: with
  // unsigned operands a difference can never be negative, so the original
  // `size_ - (offset + length) < 0` style checks could not fire.
  const uint64_t available = static_cast<uint64_t>(size_);
  const uint64_t begin = offset;
  if (begin + length > available) {
    return Status::OutOfBound("buffer out of bound");
  }
  if (begin + 8 <= available) {
    // Fast path: a full 8-byte read is in bounds, so read once and mask off
    // the bytes beyond `length`.
    const uint64_t mask = ~uint64_t{0} >> ((8 - length) * 8);
    *target = static_cast<int64_t>(
        static_cast<uint64_t>(GetInt64(offset)) & mask);
  } else {
    // Fewer than 8 bytes remain: assemble the value byte by byte. Work in
    // unsigned arithmetic so a byte >= 0x80 is not sign-extended and the
    // shift into bit 63 is well defined.
    uint64_t result = 0;
    for (uint32_t i = 0; i < length; i++) {
      result |= static_cast<uint64_t>(static_cast<uint8_t>(data_[offset + i]))
                << (i * 8);
    }
    *target = static_cast<int64_t>(result);
  }
  return Status::OK();
}
159+
136160
inline uint32_t PutVarUint32(uint32_t offset, int32_t value) {
137161
if (value >> 7 == 0) {
138162
data_[offset] = (int8_t)value;

cpp/fury/util/buffer_test.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,16 @@ TEST(Buffer, TestVarUint) {
6565
}
6666
}
6767

68+
// Checks GetBytesAsInt64 for a zero-length read and for 1-byte and 4-byte
// reads of an int32 stored at the start of a 64-byte buffer.
// NOTE(review): the expected values assume the buffer reads and writes
// integers in little-endian order.
TEST(Buffer, TestGetBytesAsInt64) {
  std::shared_ptr<Buffer> buffer;
  AllocateBuffer(64, &buffer);
  buffer->UnsafePut<int32_t>(0, 100);
  int64_t result = -1;
  // Zero bytes requested: the target is reset to 0.
  EXPECT_TRUE(buffer->GetBytesAsInt64(0, 0, &result).ok());
  EXPECT_EQ(result, 0);
  // Only the lowest byte of the stored value is returned.
  EXPECT_TRUE(buffer->GetBytesAsInt64(0, 1, &result).ok());
  EXPECT_EQ(result, 100);
  // All four bytes of the stored int32; bytes 4..7 of the buffer are never
  // written by this test, so they must be masked out of the result.
  result = -1;
  EXPECT_TRUE(buffer->GetBytesAsInt64(0, 4, &result).ok());
  EXPECT_EQ(result, 100);
}
6878
} // namespace fury
6979

7080
int main(int argc, char **argv) {

cpp/fury/util/logging.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,4 +111,4 @@ bool FuryLog::IsLevelEnabled(FuryLogLevel log_level) {
111111
return log_level >= fury_severity_threshold;
112112
}
113113

114-
} // namespace fury
114+
} // namespace fury

cpp/fury/util/status.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,11 +83,12 @@ namespace fury {
8383
enum class StatusCode : char {
8484
OK = 0,
8585
OutOfMemory = 1,
86-
KeyError = 2,
87-
TypeError = 3,
88-
Invalid = 4,
89-
IOError = 5,
90-
UnknownError = 6,
86+
OutOfBound = 2,
87+
KeyError = 3,
88+
TypeError = 4,
89+
Invalid = 5,
90+
IOError = 6,
91+
UnknownError = 7,
9192
};
9293

9394
class Status {
@@ -123,6 +124,10 @@ class Status {
123124
return Status(StatusCode::OutOfMemory, msg);
124125
}
125126

127+
/// Builds a Status that reports an out-of-bound access.
///
/// @param msg human-readable description stored in the returned Status.
/// @return a Status whose code is StatusCode::OutOfBound (previously this
///         factory tagged the status as OutOfMemory, which made bounds
///         errors indistinguishable from allocation failures).
static Status OutOfBound(const std::string &msg) {
  return Status(StatusCode::OutOfBound, msg);
}
130+
126131
static Status KeyError(const std::string &msg) {
127132
return Status(StatusCode::KeyError, msg);
128133
}

docs/guide/xlang_serialization_guide.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -225,8 +225,8 @@ class SomeClass2:
225225

226226
if __name__ == "__main__":
227227
f = pyfury.Fury()
228-
f.register_class(SomeClass1, type_tag="example.SomeClass1")
229-
f.register_class(SomeClass2, type_tag="example.SomeClass2")
228+
f.register_type(SomeClass1, typename="example.SomeClass1")
229+
f.register_type(SomeClass2, typename="example.SomeClass2")
230230
obj1 = SomeClass1(f1=True, f2={-1: 2})
231231
obj = SomeClass2(
232232
f1=obj1,
@@ -444,7 +444,7 @@ class SomeClass:
444444
f3: Dict[str, str]
445445

446446
fury = pyfury.Fury(ref_tracking=True)
447-
fury.register_class(SomeClass, type_tag="example.SomeClass")
447+
fury.register_type(SomeClass, typename="example.SomeClass")
448448
obj = SomeClass()
449449
obj.f2 = {"k1": "v1", "k2": "v2"}
450450
obj.f1, obj.f3 = obj, obj.f2

0 commit comments

Comments
 (0)