Skip to content

Commit 1a7d3b6

Browse files
simplify string load limit
1 parent 030f8df commit 1a7d3b6

File tree

3 files changed

+19
-82
lines changed

3 files changed

+19
-82
lines changed

design/mvp/CanonicalABI.md

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2158,13 +2158,13 @@ of source code units.
21582158
The `MAX_STRING_BYTE_LENGTH` constant ensures that the high bit of a
21592159
string's number of code units is never set, keeping it clear for `UTF16_TAG`.
21602160

2161-
Since this byte length of a string depends on the encoding, we estimate the
2162-
worst case length across all encodings when loading the string and trap if the
2163-
maximum length might be exceeded. Generally the worst case length comes from
2164-
encoding in UTF-16 where byte length could be twice the number of code units.
2165-
But if the original encoding was UTF-16 the byte length may be up to 3 times the
2166-
number of code units when encoding in UTF-8 if there are code points at 2^7 or
2167-
higher.
2161+
Since the byte length of a string depends on the encoding, we additionally
2162+
restrict the total code units to `MAX_STRING_CODE_UNITS = (1 << 28) - 1` when
2163+
loading a string to ensure that it won't exceed the maximum byte length when
2164+
converted to a different encoding. The worst-case inflation for string length
2165+
comes in `store_utf16_to_utf8` which may result in 3 bytes per code unit in the
2166+
original encoding, so this limit is low enough to keep strings within the
2167+
maximum length.
21682168
```python
21692169
String = tuple[str, str, int]
21702170

@@ -2175,15 +2175,11 @@ def load_string(cx, ptr) -> String:
21752175

21762176
UTF16_TAG = 1 << 31
21772177

2178-
def worst_case_string_byte_length(string : String):
2179-
(s, encoding, tagged_code_units) = string
2180-
if encoding == 'utf16' or (encoding == 'latin1+utf16' and (tagged_code_units & UTF16_TAG)):
2181-
for code_point in s:
2182-
if ord(code_point) >= 2 ** 7:
2183-
return 3 * (tagged_code_units & ~UTF16_TAG)
2184-
return 2 * (tagged_code_units & ~UTF16_TAG)
2185-
21862178
MAX_STRING_BYTE_LENGTH = (1 << 31) - 1
2179+
MAX_STRING_CODE_UNITS = (1 << 28) - 1
2180+
# The worst case for string byte length comes in store_utf16_to_utf8 where
2181+
# we may end up with 3 bytes for each original code unit.
2182+
assert(MAX_STRING_CODE_UNITS * 3 <= MAX_STRING_BYTE_LENGTH)
21872183

21882184
def load_string_from_range(cx, ptr, tagged_code_units) -> String:
21892185
match cx.opts.string_encoding:
@@ -2211,10 +2207,9 @@ def load_string_from_range(cx, ptr, tagged_code_units) -> String:
22112207
except UnicodeError:
22122208
trap()
22132209

2214-
string = (s, cx.opts.string_encoding, tagged_code_units)
2215-
trap_if(worst_case_string_byte_length(string) > MAX_STRING_BYTE_LENGTH)
2210+
trap_if((tagged_code_units & ~UTF16_TAG) > MAX_STRING_CODE_UNITS)
22162211

2217-
return string
2212+
return (s, cx.opts.string_encoding, tagged_code_units)
22182213
```
22192214

22202215
Error context values are lifted directly from the current component instance's

design/mvp/canonical-abi/definitions.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,15 +1286,11 @@ def load_string(cx, ptr) -> String:
12861286

12871287
UTF16_TAG = 1 << 31
12881288

1289-
def worst_case_string_byte_length(string : String):
1290-
(s, encoding, tagged_code_units) = string
1291-
if encoding == 'utf16' or (encoding == 'latin1+utf16' and (tagged_code_units & UTF16_TAG)):
1292-
for code_point in s:
1293-
if ord(code_point) >= 2 ** 7:
1294-
return 3 * (tagged_code_units & ~UTF16_TAG)
1295-
return 2 * (tagged_code_units & ~UTF16_TAG)
1296-
12971289
MAX_STRING_BYTE_LENGTH = (1 << 31) - 1
1290+
MAX_STRING_CODE_UNITS = (1 << 28) - 1
1291+
# The worst case for string byte length comes in store_utf16_to_utf8 where
1292+
# we may end up with 3 bytes for each original code unit.
1293+
assert(MAX_STRING_CODE_UNITS * 3 <= MAX_STRING_BYTE_LENGTH)
12981294

12991295
def load_string_from_range(cx, ptr, tagged_code_units) -> String:
13001296
match cx.opts.string_encoding:
@@ -1322,10 +1318,9 @@ def load_string_from_range(cx, ptr, tagged_code_units) -> String:
13221318
except UnicodeError:
13231319
trap()
13241320

1325-
string = (s, cx.opts.string_encoding, tagged_code_units)
1326-
trap_if(worst_case_string_byte_length(string) > MAX_STRING_BYTE_LENGTH)
1321+
trap_if((tagged_code_units & ~UTF16_TAG) > MAX_STRING_CODE_UNITS)
13271322

1328-
return string
1323+
return (s, cx.opts.string_encoding, tagged_code_units)
13291324

13301325
def lift_error_context(cx, i):
13311326
errctx = cx.inst.handles.get(i)

design/mvp/canonical-abi/run_tests.py

Lines changed: 0 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -466,58 +466,6 @@ def on_resolve(result):
466466
test_roundtrip(t, v, addr_type=addr_type)
467467

468468

469-
def assert_trap_on_load_string(src_encoding, s, tagged_code_units, encoded):
470-
ptr_offset = 8
471-
memory = bytearray(ptr_offset + len(encoded))
472-
memory[0:4] = int.to_bytes(ptr_offset, 4, 'little')
473-
memory[4:8] = int.to_bytes(tagged_code_units, 4, 'little')
474-
memory[ptr_offset:] = encoded
475-
cx = mk_cx(MemInst(memory, 'i32'), src_encoding)
476-
try:
477-
load(cx, 0, StringType())
478-
fail("expected trap loading {!r} as {}".format(s, src_encoding))
479-
except Trap:
480-
pass
481-
482-
def test_string_byte_length_limit():
483-
saved = definitions.MAX_STRING_BYTE_LENGTH
484-
try:
485-
definitions.MAX_STRING_BYTE_LENGTH = 20
486-
487-
# Loading from UTF-8: 10 bytes will succeed, 11 bytes will trap on load
488-
for dst in encodings:
489-
test_string('utf8', dst, 'helloworld')
490-
assert_trap_on_load_string('utf8', 'hello world', 11, b'hello world')
491-
492-
# Loading from UTF-16 all ASCII: 10 code units will succeed, 11 will trap on
493-
# load
494-
for dst in encodings:
495-
test_string('utf16', dst, 'abcdefghij')
496-
assert_trap_on_load_string('utf16', 'abcdefghijk', 11,
497-
'abcdefghijk'.encode('utf-16-le'))
498-
499-
# UTF-16 non-ASCII: 6 code units will succeed, 7 will trap on load
500-
for dst in encodings:
501-
test_string('utf16', dst, 'ab\u0100def')
502-
assert_trap_on_load_string('utf16', '\u0100abcdef', 7,
503-
'\u0100abcdef'.encode('utf-16-le'))
504-
505-
# Latin1+utf16 (latin1): 10 bytes will succeed, 11 will trap on load
506-
for dst in encodings:
507-
test_string('latin1+utf16', dst, 'helloworld')
508-
assert_trap_on_load_string('latin1+utf16', 'hello world', 11,
509-
b'hello world')
510-
511-
# Latin1+utf16 (utf16 variant, non-ASCII): 6 code units will succeed, 7
512-
# will trap on load
513-
for dst in encodings:
514-
test_string('latin1+utf16', dst, '\u0100abcde')
515-
assert_trap_on_load_string('latin1+utf16', '\u0100abcdef', 7 | UTF16_TAG,
516-
'\u0100abcdef'.encode('utf-16-le'))
517-
518-
finally:
519-
definitions.MAX_STRING_BYTE_LENGTH = saved
520-
521469
def test_list_byte_length_limit():
522470
saved = definitions.MAX_LIST_BYTE_LENGTH
523471
try:
@@ -2931,7 +2879,6 @@ def mk_task(supertask, inst):
29312879

29322880

29332881
test_roundtrips()
2934-
test_string_byte_length_limit()
29352882
test_list_byte_length_limit()
29362883
test_handles()
29372884
test_async_to_async()

0 commit comments

Comments
 (0)