Skip to content

Commit 3141860

Browse files
authored
feat(python): meta share mode for pyfory compatible serialization (#2593)
## Why? implement meta share mode for pyfory, so the struct can add/delete fields and have inconsistent schema bettween serialization and deserialization ## What does this PR do? <!-- Describe the details of this PR. --> ## Related issues #2509 #1938 #2160 #2278 ## Does this PR introduce any user-facing change? <!-- If any user-facing interface changes, please [open an issue](https://github.com/apache/fory/issues/new/choose) describing the need to do so and update the document if necessary. Delete section if not applicable. --> - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark <!-- When the PR has an impact on performance (if you don't know whether the PR will have an impact on performance, you can submit the PR first, and if it will have impact on performance, the code reviewer will explain it), be sure to attach a benchmark data here. Delete section if not applicable. -->
1 parent 67c0280 commit 3141860

20 files changed

Lines changed: 830 additions & 226 deletions

ci/format.sh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -125,11 +125,11 @@ format_files() {
125125

126126
format_all_scripts() {
127127
echo "$(date)" "Ruff format...."
128-
git ls-files -- '*.py' '*.pyx' '*.pxd' '*.pxi' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \
128+
git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \
129129
ruff format
130130

131131
echo "$(date)" "Ruff check...."
132-
git ls-files -- '*.py' '*.pyx' '*.pxd' '*.pxi' "${GIT_LS_EXCLUDES[@]}" | xargs \
132+
git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs \
133133
ruff check --fix
134134
}
135135

@@ -193,10 +193,10 @@ format_changed() {
193193
# exist on both branches.
194194
MERGEBASE="$(git merge-base origin/main HEAD)"
195195

196-
if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyx' '*.pxd' '*.pxi' &>/dev/null; then
197-
git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' '*.pyx' '*.pxd' '*.pxi' | xargs -P 5 \
196+
if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' &>/dev/null; then
197+
git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \
198198
ruff format
199-
git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' '*.pyx' '*.pxd' '*.pxi' | xargs -P 5 \
199+
git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \
200200
ruff check --fix
201201
fi
202202

python/pyfory/_fory.py

Lines changed: 42 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ class Fory:
9898
__slots__ = (
9999
"language",
100100
"is_py",
101+
"compatible",
101102
"ref_tracking",
102103
"ref_resolver",
103104
"type_resolver",
@@ -113,13 +114,13 @@ class Fory:
113114
"_unsupported_objects",
114115
"_peer_language",
115116
)
116-
serialization_context: "SerializationContext"
117117

118118
def __init__(
119119
self,
120120
language=Language.PYTHON,
121121
ref_tracking: bool = False,
122122
require_type_registration: bool = True,
123+
compatible: bool = False,
123124
):
124125
"""
125126
:param require_type_registration:
@@ -130,10 +131,14 @@ def __init__(
130131
Do not disable type registration if you can't ensure your environment are
131132
*indeed secure*. We are not responsible for security risks if
132133
you disable this option.
134+
:param compatible:
135+
Whether to enable compatible mode for cross-language serialization.
136+
When enabled, type forward/backward compatibility for struct fields will be enabled.
133137
"""
134138
self.language = language
135139
self.is_py = language == Language.PYTHON
136140
self.require_type_registration = _ENABLE_TYPE_REGISTRATION_FORCIBLY or require_type_registration
141+
self.compatible = compatible
137142
self.ref_tracking = ref_tracking
138143
if self.ref_tracking:
139144
self.ref_resolver = MapRefResolver()
@@ -143,9 +148,11 @@ def __init__(
143148
from pyfory._registry import TypeResolver
144149

145150
self.metastring_resolver = MetaStringResolver()
146-
self.type_resolver = TypeResolver(self)
151+
self.type_resolver = TypeResolver(self, meta_share=compatible)
147152
self.type_resolver.initialize()
148-
self.serialization_context = SerializationContext()
153+
from pyfory._serialization import SerializationContext
154+
155+
self.serialization_context = SerializationContext(scoped_meta_share_enabled=compatible)
149156
self.buffer = Buffer.allocate(32)
150157
if not require_type_registration:
151158
warnings.warn(
@@ -255,10 +262,26 @@ def _serialize(
255262
set_bit(buffer, mask_index, 3)
256263
else:
257264
clear_bit(buffer, mask_index, 3)
265+
# Reserve space for type definitions offset, similar to Java implementation
266+
type_defs_offset_pos = None
267+
if self.serialization_context.scoped_meta_share_enabled:
268+
type_defs_offset_pos = buffer.writer_index
269+
buffer.write_int32(-1) # Reserve 4 bytes for type definitions offset
270+
258271
if self.language == Language.PYTHON:
259272
self.serialize_ref(buffer, obj)
260273
else:
261274
self.xserialize_ref(buffer, obj)
275+
276+
# Write type definitions at the end, similar to Java implementation
277+
if self.serialization_context.scoped_meta_share_enabled:
278+
meta_context = self.serialization_context.meta_context
279+
if meta_context is not None and len(meta_context.get_writing_type_defs()) > 0:
280+
# Update the offset to point to current position
281+
current_pos = buffer.writer_index
282+
buffer.put_int32(type_defs_offset_pos, current_pos - type_defs_offset_pos - 4)
283+
self.type_resolver.write_type_defs(buffer)
284+
262285
self.reset_write()
263286
if buffer is not self.buffer:
264287
return buffer
@@ -369,6 +392,20 @@ def _deserialize(
369392
self._buffers = iter(buffers)
370393
else:
371394
assert buffers is None, "buffers should be null when the serialized stream is produced with buffer_callback null."
395+
396+
# Read type definitions at the start, similar to Java implementation
397+
if self.serialization_context.scoped_meta_share_enabled:
398+
relative_type_defs_offset = buffer.read_int32()
399+
if relative_type_defs_offset != -1:
400+
# Save current reader position
401+
current_reader_index = buffer.reader_index
402+
# Jump to type definitions
403+
buffer.reader_index = current_reader_index + relative_type_defs_offset
404+
# Read type definitions
405+
self.type_resolver.read_type_defs(buffer)
406+
# Jump back to continue with object deserialization
407+
buffer.reader_index = current_reader_index
408+
372409
if is_target_x_lang:
373410
obj = self.xdeserialize_ref(buffer)
374411
else:
@@ -470,7 +507,7 @@ def read_ref_pyobject(self, buffer):
470507
def reset_write(self):
471508
self.ref_resolver.reset_write()
472509
self.type_resolver.reset_write()
473-
self.serialization_context.reset()
510+
self.serialization_context.reset_write()
474511
self.metastring_resolver.reset_write()
475512
self.pickler.clear_memo()
476513
self._buffer_callback = None
@@ -479,7 +516,7 @@ def reset_write(self):
479516
def reset_read(self):
480517
self.ref_resolver.reset_read()
481518
self.type_resolver.reset_read()
482-
self.serialization_context.reset()
519+
self.serialization_context.reset_read()
483520
self.metastring_resolver.reset_write()
484521
self.unpickler = None
485522
self._buffers = None
@@ -490,36 +527,6 @@ def reset(self):
490527
self.reset_read()
491528

492529

493-
class SerializationContext:
494-
"""
495-
A context is used to add some context-related information, so that the
496-
serializers can setup relation between serializing different objects.
497-
The context will be reset after finished serializing/deserializing the
498-
object tree.
499-
"""
500-
501-
__slots__ = ("objects",)
502-
503-
def __init__(self):
504-
self.objects = dict()
505-
506-
def add(self, key, obj):
507-
self.objects[id(key)] = obj
508-
509-
def __contains__(self, key):
510-
return id(key) in self.objects
511-
512-
def __getitem__(self, key):
513-
return self.objects[id(key)]
514-
515-
def get(self, key):
516-
return self.objects.get(id(key))
517-
518-
def reset(self):
519-
if len(self.objects) > 0:
520-
self.objects.clear()
521-
522-
523530
_ENABLE_TYPE_REGISTRATION_FORCIBLY = os.getenv("ENABLE_TYPE_REGISTRATION_FORCIBLY", "0") in {
524531
"1",
525532
"true",

0 commit comments

Comments
 (0)