Skip to content

Commit ef6f68e

Browse files
authored
feat(python): type meta encoding for python (#2509)
## Why? Forward/backward-compatible type serialization is critical for online services, where different services update their data schemas and deploy at different times, which leaves the schema in an inconsistent state. Shared type meta encoding can address this: https://fory.apache.org/docs/specification/fory_xlang_serialization_spec#type-def ## What does this PR do? Adds type meta encoding for Python to support forward/backward-compatible type serialization. Things not finished in this PR: - not-null field support - generating serializers from type meta - meta share mode These features will be implemented in follow-up PRs. ## Related issues #1938 #2160 #2278 ## Does this PR introduce any user-facing change? <!-- If any user-facing interface changes, please [open an issue](https://github.com/apache/fory/issues/new/choose) describing the need to do so and update the documentation if necessary. Delete this section if not applicable. --> - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark <!-- When the PR has an impact on performance (if you don't know whether the PR will have an impact on performance, you can submit the PR first, and if it does have an impact on performance, the code reviewer will explain it), be sure to attach benchmark data here. Delete this section if not applicable. -->
1 parent 7a2700f commit ef6f68e

13 files changed

Lines changed: 887 additions & 9 deletions

python/pyfory/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
Language,
2222
)
2323

24+
PYTHON = Language.PYTHON
25+
XLANG = Language.XLANG
26+
2427
try:
2528
from pyfory._serialization import ENABLE_FORY_CYTHON_SERIALIZATION
2629
except ImportError:

python/pyfory/_registry.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
ObjectSerializer,
6767
)
6868
from pyfory.meta.metastring import MetaStringEncoder, MetaStringDecoder
69+
from pyfory.meta.meta_compressor import DeflaterMetaCompressor
6970
from pyfory.type import (
7071
TypeId,
7172
Int8Type,
@@ -154,6 +155,7 @@ class TypeResolver:
154155
"namespace_decoder",
155156
"typename_encoder",
156157
"typename_decoder",
158+
"meta_compressor",
157159
"require_registration",
158160
"metastring_resolver",
159161
"language",
@@ -182,6 +184,7 @@ def __init__(self, fory):
182184
self.namespace_decoder = MetaStringDecoder(".", "_")
183185
self.typename_encoder = MetaStringEncoder("$", "_")
184186
self.typename_decoder = MetaStringDecoder("$", "_")
187+
self.meta_compressor = DeflaterMetaCompressor()
185188

186189
def initialize(self):
187190
self._initialize_xlang()
@@ -576,6 +579,17 @@ def read_typeinfo(self, buffer):
576579
else:
577580
return self._type_id_to_typeinfo[type_id]
578581

582+
def get_typeinfo_by_id(self, type_id):
    """Look up the typeinfo stored under ``type_id``.

    The lookup indexes ``self._type_id_to_typeinfo`` directly, so whatever
    that container raises for an unknown ``type_id`` propagates to the caller.
    """
    typeinfo = self._type_id_to_typeinfo[type_id]
    return typeinfo
585+
586+
def get_typeinfo_by_name(self, namespace, typename):
    """Look up the typeinfo stored under the ``(namespace, typename)`` pair.

    The result is whatever ``self._named_type_to_typeinfo.get`` returns for
    that key (presumably ``None`` when the pair is unknown -- confirm that
    the container is a plain dict).
    """
    lookup_key = (namespace, typename)
    return self._named_type_to_typeinfo.get(lookup_key)
589+
590+
def get_meta_compressor(self):
    """Return the object currently stored in ``self.meta_compressor``."""
    compressor = self.meta_compressor
    return compressor
592+
579593
def reset(self):
580594
pass
581595

python/pyfory/_serialization.pyx

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,15 @@ cdef class TypeResolver:
570570
raise ValueError(f"Unexpected type_id {type_id}")
571571
typeinfo = <TypeInfo> typeinfo_ptr
572572
return typeinfo
573+
574+
def get_typeinfo_by_id(self, type_id):
    """Forward the ``type_id`` lookup to the wrapped ``self._resolver``."""
    resolver = self._resolver
    return resolver.get_typeinfo_by_id(type_id=type_id)
576+
577+
def get_typeinfo_by_name(self, namespace, typename):
    """Forward the ``(namespace, typename)`` lookup to ``self._resolver``."""
    resolver = self._resolver
    return resolver.get_typeinfo_by_name(
        namespace=namespace,
        typename=typename,
    )
579+
580+
def get_meta_compressor(self):
    """Return whatever ``self._resolver.get_meta_compressor()`` returns."""
    resolver = self._resolver
    return resolver.get_meta_compressor()
573582

574583
cpdef inline reset(self):
575584
pass

python/pyfory/_struct.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import datetime
1919
import enum
2020
import logging
21+
import typing
2122

2223
from pyfory.type import (
2324
TypeVisitor,
@@ -166,7 +167,7 @@ def numeric_sorter(item):
166167
map_types = sorted(map_types, key=sorter)
167168
other_types = sorted(other_types, key=sorter)
168169
all_types = boxed_types + final_types + other_types + collection_types + map_types
169-
return [t[1] for t in all_types], [t[2] for t in all_types]
170+
return [t[2] for t in all_types], [t[1] for t in all_types]
170171

171172

172173
class StructHashVisitor(TypeVisitor):
@@ -221,3 +222,49 @@ def _compute_field_hash(hash_, id_):
221222

222223
def get_hash(self):
223224
return self._hash
225+
226+
227+
class StructTypeIdVisitor(TypeVisitor):
    """Visitor that describes a field's declared type in terms of type ids.

    NOTE(review): the ``visit_*`` methods do not share one return shape --
    ``visit_list`` yields a 2-tuple, ``visit_dict`` a 3-tuple,
    ``visit_customized`` the pair ``(None, None)``, and ``visit_other`` either
    a bare type id (enums), ``(None, None)``, or a one-element list. Confirm
    that every consumer handles all of these before relying on a single shape.
    """

    def __init__(
        self,
        fory,
    ):
        # Kept so ``visit_other`` can reach ``fory.type_resolver``.
        self.fory = fory

    def visit_list(self, field_name, elem_type, types_path=None):
        """Return ``(TypeId.LIST, ids)`` where ``ids`` describes the element type."""
        # Recurse through infer_field so nested declarations such as
        # List[Dict[str, str]] are described as well.
        nested_ids = infer_field("item", elem_type, self, types_path=types_path)
        return TypeId.LIST, nested_ids

    def visit_dict(self, field_name, key_type, value_type, types_path=None):
        """Return ``(TypeId.MAP, key_ids, value_ids)`` for a mapping field."""
        # Keys are inferred before values; both recurse so declarations such
        # as Dict[str, Dict[str, str]] are described as well.
        ids_for_key = infer_field("key", key_type, self, types_path=types_path)
        ids_for_value = infer_field("value", value_type, self, types_path=types_path)
        return TypeId.MAP, ids_for_key, ids_for_value

    def visit_customized(self, field_name, type_, types_path=None):
        """Return ``(None, None)``; no type id is reported for these types."""
        return None, None

    def visit_other(self, field_name, type_, types_path=None):
        """Resolve the type id for enums, basic types and Python array types.

        Returns the bare ``type_id`` for ``enum.Enum`` subclasses,
        ``(None, None)`` for anything that is neither a basic type nor a
        Python array type, and ``[type_id]`` otherwise.
        """
        from pyfory.serializer import PickleSerializer  # Local import

        if is_subclass(type_, enum.Enum):
            enum_info = self.fory.type_resolver.get_typeinfo(type_)
            return enum_info.type_id

        is_supported = type_ in basic_types or is_py_array_type(type_)
        if not is_supported:
            return None, None

        resolved = self.fory.type_resolver.get_typeinfo(type_)
        # A basic/array type is not expected to resolve to the pickle serializer.
        assert not isinstance(resolved.serializer, (PickleSerializer,))
        return [resolved.type_id]
259+
260+
def get_field_names(clz, type_hints=None):
    """Return the sorted field names declared on ``clz``.

    Type hints are the primary source: without an instance the attributes of
    a regular class cannot be enumerated, so the annotated names are used.
    When no type hints are available the names listed in ``__slots__`` are
    used instead.

    Args:
        clz: The class to inspect.
        type_hints: Optional pre-computed mapping of field name to type. When
            ``None`` it is computed with ``typing.get_type_hints(clz)``.

    Returns:
        A sorted list of field names; empty when neither type hints nor
        ``__slots__`` provide any.
    """
    # Every class object exposes ``__dict__``, so this check alone cannot
    # distinguish slotted classes; an empty hint mapping must fall through to
    # the ``__slots__`` lookup below instead of returning early.
    if hasattr(clz, "__dict__"):
        if type_hints is None:
            type_hints = typing.get_type_hints(clz)
        if type_hints:
            return sorted(type_hints)

    slots = getattr(clz, "__slots__", None)
    if slots is None:
        return []
    if isinstance(slots, str):
        # A bare string declares exactly one slot; sorting it directly would
        # split the name into single characters.
        slots = (slots,)
    return sorted(slots)

python/pyfory/_util.pxd

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ cdef class Buffer:
9494

9595
cpdef inline write_bool(self, c_bool value)
9696

97+
cpdef inline write_uint8(self, uint8_t value)
98+
9799
cpdef inline write_int8(self, int8_t value)
98100

99101
cpdef inline write_int16(self, int16_t value)

python/pyfory/_util.pyx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,11 @@ cdef class Buffer:
167167
self.grow(<int32_t>1)
168168
(<c_bool *>(self._c_address + self.writer_index))[0] = value
169169
self.writer_index += <int32_t>1
170+
171+
cpdef inline write_uint8(self, uint8_t value):
    """Append ``value`` as one unsigned byte at ``writer_index`` and advance it by one."""
    cdef uint8_t *target
    # Call grow() first and only then compute the destination address from
    # ``_c_address``, preserving the order used by the original statements.
    self.grow(<int32_t>1)
    target = <uint8_t *>(self._c_address + self.writer_index)
    target[0] = value
    self.writer_index += <int32_t>1
170175

171176
cpdef inline write_int8(self, int8_t value):
172177
self.grow(<int32_t>1)

python/pyfory/meta/metastring.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,7 @@ def encode_with_encoding(self, input_string: str, encoding: Encoding) -> MetaStr
372372
self.special_char2,
373373
)
374374

375-
def compute_encoding(self, input_string: str) -> Encoding:
375+
def compute_encoding(self, input_string: str, encodings: List[Encoding] = None) -> Encoding:
376376
"""
377377
Determines the encoding type of the input string.
378378
@@ -384,23 +384,28 @@ def compute_encoding(self, input_string: str) -> Encoding:
384384
"""
385385
if not input_string:
386386
return Encoding.LOWER_SPECIAL
387+
if encodings is None:
388+
encodings = list(Encoding.__members__.values())
387389

388390
chars = list(input_string)
389391
statistics = self._compute_statistics(chars)
390-
if statistics.can_lower_special_encoded:
392+
if statistics.can_lower_special_encoded and Encoding.LOWER_SPECIAL in encodings:
391393
return Encoding.LOWER_SPECIAL
392-
elif statistics.can_lower_upper_digit_special_encoded:
394+
elif statistics.can_lower_upper_digit_special_encoded and Encoding.LOWER_UPPER_DIGIT_SPECIAL in encodings:
393395
if statistics.digit_count != 0:
394396
return Encoding.LOWER_UPPER_DIGIT_SPECIAL
395397
else:
396398
upper_count = statistics.upper_count
397399
if upper_count == 1 and chars[0].isupper():
398400
return Encoding.FIRST_TO_LOWER_SPECIAL
399-
if (len(chars) + upper_count) * 5 < len(chars) * 6:
401+
if (len(chars) + upper_count) * 5 < len(chars) * 6 and Encoding.ALL_TO_LOWER_SPECIAL in encodings:
400402
return Encoding.ALL_TO_LOWER_SPECIAL
401403
else:
402-
return Encoding.LOWER_UPPER_DIGIT_SPECIAL
403-
return Encoding.UTF_8
404+
if Encoding.LOWER_UPPER_DIGIT_SPECIAL in encodings:
405+
return Encoding.LOWER_UPPER_DIGIT_SPECIAL
406+
if Encoding.UTF_8 in encodings:
407+
return Encoding.UTF_8
408+
raise ValueError(f"No encoding found for string: {input_string}, encodings: {encodings}")
404409

405410
def _compute_statistics(self, chars: List[str]) -> Statistics:
406411
"""

0 commit comments

Comments
 (0)