Skip to content

Commit 8d2d124

Browse files
authored
feat(python): Hardcoding metastring into passable parameters (#1987)
<!-- **Thanks for contributing to Fury.** **If this is your first time opening a PR on Fury, you can refer to [CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).** Contribution Checklist - The **Apache Fury (incubating)** community has restrictions on the naming of PR titles. You can also find instructions in [CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md). - Fury has a strong focus on performance. If the PR you submit will have an impact on performance, please benchmark it first and provide the benchmark result here. --> ## What does this PR do? Previously, the Python MetaStringEncoder and MetaStringDecoder hard-coded the two special characters (`.` and `_`) used by the LOWER_UPPER_DIGIT_SPECIAL encoding, which was not the best choice. This PR turns them into constructor parameters (`special_char1` and `special_char2`) on MetaString, MetaStringEncoder and MetaStringDecoder, so that the caller selects which special characters are used. <!-- Describe the purpose of this PR. --> ## Related issues Closes #1983 <!-- Is there any related issue? Please attach here. - #xxxx0 - #xxxx1 - #xxxx2 --> ## Does this PR introduce any user-facing change? <!-- If any user-facing interface changes, please [open an issue](https://github.com/apache/fury/issues/new/choose) describing the need to do so and update the document if necessary. --> - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark <!-- When the PR has an impact on performance (if you don't know whether the PR will have an impact on performance, you can submit the PR first, and if it will have impact on performance, the code reviewer will explain it), be sure to attach a benchmark data here. -->
1 parent 1515f94 commit 8d2d124

2 files changed

Lines changed: 122 additions & 37 deletions

File tree

python/pyfury/meta/metastring.py

Lines changed: 92 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,20 @@ class Encoding(Enum):
4848

4949
class MetaString:
5050
def __init__(
51-
self, original: str, encoding: Encoding, encoded_data: bytes, length: int
51+
self,
52+
original: str,
53+
encoding: Encoding,
54+
encoded_data: bytes,
55+
length: int,
56+
special_char1: str = ".",
57+
special_char2: str = "|",
5258
):
5359
self.original = original
5460
self.encoding = encoding
5561
self.encoded_data = encoded_data
5662
self.length = length
63+
self.special_char1 = special_char1
64+
self.special_char2 = special_char2
5765
if self.encoding != Encoding.UTF_8:
5866
self.strip_last_char = (encoded_data[0] & 0x80) != 0
5967
else:
@@ -65,6 +73,17 @@ class MetaStringDecoder:
6573
Decodes MetaString objects back into their original plain text form.
6674
"""
6775

76+
def __init__(self, special_char1: str, special_char2: str):
77+
"""
78+
Creates a MetaStringDecoder with specified special characters used for decoding.
79+
80+
Args:
81+
special_char1 (str): The first special character used for encoding.
82+
special_char2 (str): The second special character used for encoding.
83+
"""
84+
self.special_char1 = special_char1
85+
self.special_char2 = special_char2
86+
6887
def decode(self, encoded_data: bytes, encoding: Encoding) -> str:
6988
"""
7089
Decodes the encoded data using the specified encoding.
@@ -203,9 +222,9 @@ def _decode_lower_upper_digit_special_char(self, char_value: int) -> str:
203222
elif 52 <= char_value <= 61:
204223
return chr(ord("0") + (char_value - 52))
205224
elif char_value == 62:
206-
return "."
225+
return self.special_char1 # Use special_char1 for the encoding
207226
elif char_value == 63:
208-
return "_"
227+
return self.special_char2 # Use special_char2 for the encoding
209228
else:
210229
raise ValueError(
211230
f"Invalid character value for LOWER_UPPER_DIGIT_SPECIAL: {char_value}"
@@ -250,9 +269,16 @@ def _decode_rep_all_to_lower_special(self, data: bytes) -> str:
250269

251270

252271
class MetaStringEncoder:
253-
"""
254-
Encodes plain text strings into MetaString objects with specified encoding mechanisms.
255-
"""
272+
def __init__(self, special_char1: str, special_char2: str):
273+
"""
274+
Creates a MetaStringEncoder with specified special characters used for encoding.
275+
276+
Args:
277+
special_char1 (str): The first special character used in custom encoding.
278+
special_char2 (str): The second special character used in custom encoding.
279+
"""
280+
self.special_char1 = special_char1
281+
self.special_char2 = special_char2
256282

257283
def encode(self, input_string: str) -> MetaString:
258284
"""
@@ -270,7 +296,14 @@ def encode(self, input_string: str) -> MetaString:
270296
), "Long meta string than _METASTRING_NUM_CHARS_LIMIT is not allowed."
271297

272298
if not input_string:
273-
return MetaString(input_string, Encoding.UTF_8, bytes(), 0)
299+
return MetaString(
300+
input_string,
301+
Encoding.UTF_8,
302+
bytes(),
303+
0,
304+
self.special_char1,
305+
self.special_char2,
306+
)
274307

275308
encoding = self.compute_encoding(input_string)
276309
return self.encode_with_encoding(input_string, encoding)
@@ -292,29 +325,67 @@ def encode_with_encoding(self, input_string: str, encoding: Encoding) -> MetaStr
292325
), "Long meta string than _METASTRING_NUM_CHARS_LIMIT is not allowed."
293326

294327
if not input_string:
295-
return MetaString(input_string, Encoding.UTF_8, bytes(), 0)
328+
return MetaString(
329+
input_string,
330+
Encoding.UTF_8,
331+
bytes(),
332+
0,
333+
self.special_char1,
334+
self.special_char2,
335+
)
296336

297337
length = len(input_string)
298338
if encoding == Encoding.LOWER_SPECIAL:
299339
encoded_data = self._encode_lower_special(input_string)
300-
return MetaString(input_string, encoding, encoded_data, length * 5)
340+
return MetaString(
341+
input_string,
342+
encoding,
343+
encoded_data,
344+
length * 5,
345+
self.special_char1,
346+
self.special_char2,
347+
)
301348
elif encoding == Encoding.LOWER_UPPER_DIGIT_SPECIAL:
302349
encoded_data = self._encode_lower_upper_digit_special(input_string)
303-
return MetaString(input_string, encoding, encoded_data, length * 6)
350+
return MetaString(
351+
input_string,
352+
encoding,
353+
encoded_data,
354+
length * 6,
355+
self.special_char1,
356+
self.special_char2,
357+
)
304358
elif encoding == Encoding.FIRST_TO_LOWER_SPECIAL:
305359
encoded_data = self._encode_first_to_lower_special(input_string)
306-
return MetaString(input_string, encoding, encoded_data, length * 5)
360+
return MetaString(
361+
input_string,
362+
encoding,
363+
encoded_data,
364+
length * 5,
365+
self.special_char1,
366+
self.special_char2,
367+
)
307368
elif encoding == Encoding.ALL_TO_LOWER_SPECIAL:
308369
chars = list(input_string)
309370
upper_count = sum(1 for c in chars if c.isupper())
310371
encoded_data = self._encode_all_to_lower_special(chars)
311372
return MetaString(
312-
input_string, encoding, encoded_data, (upper_count + length) * 5
373+
input_string,
374+
encoding,
375+
encoded_data,
376+
(upper_count + length) * 5,
377+
self.special_char1,
378+
self.special_char2,
313379
)
314380
else:
315381
encoded_data = bytes(input_string, "utf-8")
316382
return MetaString(
317-
input_string, Encoding.UTF_8, encoded_data, len(encoded_data) * 8
383+
input_string,
384+
Encoding.UTF_8,
385+
encoded_data,
386+
len(encoded_data) * 8,
387+
self.special_char1,
388+
self.special_char2,
318389
)
319390

320391
def compute_encoding(self, input_string: str) -> Encoding:
@@ -363,7 +434,12 @@ def _compute_statistics(self, chars: List[str]) -> Statistics:
363434
upper_count = 0
364435
for c in chars:
365436
if can_lower_upper_digit_special_encoded:
366-
if not (c.islower() or c.isupper() or c.isdigit() or c in {".", "_"}):
437+
if not (
438+
c.islower()
439+
or c.isupper()
440+
or c.isdigit()
441+
or c in {self.special_char1, self.special_char2}
442+
):
367443
can_lower_upper_digit_special_encoded = False
368444
if can_lower_special_encoded:
369445
if not (c.islower() or c in {".", "_", "$", "|"}):
@@ -500,9 +576,9 @@ def _char_to_value(self, c: str, bits_per_char: int) -> int:
500576
return 26 + (ord(c) - ord("A"))
501577
elif "0" <= c <= "9":
502578
return 52 + (ord(c) - ord("0"))
503-
elif c == ".":
579+
elif c == self.special_char1:
504580
return 62
505-
elif c == "_":
581+
elif c == self.special_char2:
506582
return 63
507583
else:
508584
raise ValueError(

python/pyfury/tests/test_metastring.py

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@
2424

2525

2626
def test_encode_metastring_lower_special():
27-
encoder = MetaStringEncoder()
28-
decoder = MetaStringDecoder()
27+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
28+
decoder = MetaStringDecoder(special_char1=".", special_char2="_")
29+
30+
# Test for encoding and decoding
2931
encoded = encoder._encode_lower_special("abc_def")
3032
assert len(encoded) == 5
3133
assert len(encoder.encode("org.apache.fury.benchmark.data").encoded_data) == 19
@@ -41,10 +43,12 @@ def test_encode_metastring_lower_special():
4143

4244

4345
def test_encode_metastring_lower_upper_digit_special():
44-
encoder = MetaStringEncoder()
46+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
47+
decoder = MetaStringDecoder(special_char1=".", special_char2="_")
48+
49+
# Test for encoding and decoding
4550
encoded = encoder._encode_lower_upper_digit_special("ExampleInput123")
4651
assert len(encoded) == 12
47-
decoder = MetaStringDecoder()
4852
decoded = decoder.decode(encoded, Encoding.LOWER_UPPER_DIGIT_SPECIAL)
4953
assert decoded == "ExampleInput123"
5054

@@ -73,25 +77,26 @@ def create_string(length):
7377

7478

7579
def test_metastring():
80+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
81+
decoder = MetaStringDecoder(special_char1=".", special_char2="_")
7682

77-
encoder = MetaStringEncoder()
7883
for i in range(1, 128):
7984
try:
8085
string = create_string(i)
8186
metastring = encoder.encode(string)
8287
assert metastring.encoding != Encoding.UTF_8
8388
assert metastring.original == string
8489

85-
decoder = MetaStringDecoder()
8690
new_string = decoder.decode(metastring.encoded_data, metastring.encoding)
8791
assert new_string == string
8892
except Exception as e:
8993
pytest.fail(f"Failed at {i} with exception: {str(e)}")
9094

9195

9296
def test_encode_empty_string():
93-
encoder = MetaStringEncoder()
94-
decoder = MetaStringDecoder()
97+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
98+
decoder = MetaStringDecoder(special_char1=".", special_char2="_")
99+
95100
for encoding in [
96101
Encoding.LOWER_SPECIAL,
97102
Encoding.LOWER_UPPER_DIGIT_SPECIAL,
@@ -106,16 +111,17 @@ def test_encode_empty_string():
106111

107112

108113
def test_encode_characters_outside_of_lower_special():
109-
encoder = MetaStringEncoder()
114+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
110115

111116
test_string = "abcdefABCDEF1234!@#"
112117
metastring = encoder.encode(test_string)
113118
assert metastring.encoding == Encoding.UTF_8
114119

115120

116121
def test_all_to_upper_special_encoding():
117-
encoder = MetaStringEncoder()
118-
decoder = MetaStringDecoder()
122+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
123+
decoder = MetaStringDecoder(special_char1=".", special_char2="_")
124+
119125
test_string = "ABC_DEF"
120126
metastring = encoder.encode(test_string)
121127
assert metastring.encoding == Encoding.LOWER_UPPER_DIGIT_SPECIAL
@@ -124,8 +130,9 @@ def test_all_to_upper_special_encoding():
124130

125131

126132
def test_first_to_lower_special_encoding():
127-
encoder = MetaStringEncoder()
128-
decoder = MetaStringDecoder()
133+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
134+
decoder = MetaStringDecoder(special_char1=".", special_char2="_")
135+
129136
test_string = "Aabcdef"
130137
metastring = encoder.encode(test_string)
131138
assert metastring.encoding == Encoding.FIRST_TO_LOWER_SPECIAL
@@ -134,8 +141,9 @@ def test_first_to_lower_special_encoding():
134141

135142

136143
def test_utf8_encoding():
137-
encoder = MetaStringEncoder()
138-
decoder = MetaStringDecoder()
144+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
145+
decoder = MetaStringDecoder(special_char1=".", special_char2="_")
146+
139147
test_string = "你好,世界" # Non-Latin characters
140148
metastring = encoder.encode(test_string)
141149
assert metastring.encoding == Encoding.UTF_8
@@ -144,7 +152,7 @@ def test_utf8_encoding():
144152

145153

146154
def test_strip_last_char():
147-
encoder = MetaStringEncoder()
155+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
148156

149157
test_string = "abc" # encoded as 1|00000|00, 001|00010, exactly two bytes
150158
encoded_metastring = encoder.encode(test_string)
@@ -156,8 +164,9 @@ def test_strip_last_char():
156164

157165

158166
def test_empty_string():
159-
encoder = MetaStringEncoder()
160-
decoder = MetaStringDecoder()
167+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
168+
decoder = MetaStringDecoder(special_char1=".", special_char2="_")
169+
161170
metastring = encoder.encode("")
162171
assert metastring.encoded_data == bytes()
163172

@@ -166,7 +175,7 @@ def test_empty_string():
166175

167176

168177
def test_ascii_encoding():
169-
encoder = MetaStringEncoder()
178+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
170179

171180
test_string = "asciiOnly"
172181
encoded_metastring = encoder.encode(test_string)
@@ -175,15 +184,15 @@ def test_ascii_encoding():
175184

176185

177186
def test_non_ascii_encoding():
178-
encoder = MetaStringEncoder()
187+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
179188

180189
test_string = "こんにちは" # Non-ASCII string
181190
encoded_metastring = encoder.encode(test_string)
182191
assert encoded_metastring.encoding == Encoding.UTF_8
183192

184193

185194
def test_non_ascii_encoding_and_non_utf8():
186-
encoder = MetaStringEncoder()
195+
encoder = MetaStringEncoder(special_char1=".", special_char2="_")
187196

188197
non_ascii_string = "こんにちは" # Non-ASCII string
189198

0 commit comments

Comments
 (0)