Skip to content

Commit 1a7d3b6

Browse files
simplify string load limit
1 parent 030f8df commit 1a7d3b6

File tree

3 files changed

+19
-82
lines changed

3 files changed

+19
-82
lines changed

design/mvp/CanonicalABI.md

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2158,13 +2158,13 @@ of source code units.
21582158
The `MAX_STRING_BYTE_LENGTH` constant ensures that the high bit of a
21592159
string's number of code units is never set, keeping it clear for `UTF16_TAG`.
21602160

2161-
Since this byte length of a string depends on the encoding, we estimate the
2162-
worst case length across all encodings when loading the string and trap if the
2163-
maximum length might be exceeded. Generally the worst case length comes from
2164-
encoding in UTF-16 where byte length could be twice the number of code units.
2165-
But if the original encoding was UTF-16 the byte length may be up to 3 times the
2166-
number of code units when encoding in UTF-8 if there are code points at 2^7 or
2167-
higher.
2161+
Since the byte length of a string depends on the encoding, we additionally
2162+
restrict the total code units to `MAX_STRING_CODE_UNITS = (1 << 28) - 1` when
2163+
loading a string to ensure that it won't exceed the maximum byte length when
2164+
converted to a different encoding. The worst-case inflation for string length
2165+
comes in `store_utf16_to_utf8` which may result in 3 bytes per code unit in the
2166+
original encoding, so this limit is low enough to keep strings within the
2167+
maximum length.
21682168
```python
21692169
String = tuple[str, str, int]
21702170

@@ -2175,15 +2175,11 @@ def load_string(cx, ptr) -> String:
21752175

21762176
UTF16_TAG = 1 << 31
21772177

2178-
def worst_case_string_byte_length(string : String):
2179-
(s, encoding, tagged_code_units) = string
2180-
if encoding == 'utf16' or (encoding == 'latin1+utf16' and (tagged_code_units & UTF16_TAG)):
2181-
for code_point in s:
2182-
if ord(code_point) >= 2 ** 7:
2183-
return 3 * (tagged_code_units & ~UTF16_TAG)
2184-
return 2 * (tagged_code_units & ~UTF16_TAG)
2185-
21862178
MAX_STRING_BYTE_LENGTH = (1 << 31) - 1
2179+
MAX_STRING_CODE_UNITS = (1 << 28) - 1
2180+
# The worst case for string byte length comes in store_utf16_to_utf8 where
2181+
# we may end up with 3 bytes for each original code unit.
2182+
assert(MAX_STRING_CODE_UNITS * 3 <= MAX_STRING_BYTE_LENGTH)
21872183

21882184
def load_string_from_range(cx, ptr, tagged_code_units) -> String:
21892185
match cx.opts.string_encoding:
@@ -2211,10 +2207,9 @@ def load_string_from_range(cx, ptr, tagged_code_units) -> String:
22112207
except UnicodeError:
22122208
trap()
22132209

2214-
string = (s, cx.opts.string_encoding, tagged_code_units)
2215-
trap_if(worst_case_string_byte_length(string) > MAX_STRING_BYTE_LENGTH)
2210+
trap_if((tagged_code_units & ~UTF16_TAG) > MAX_STRING_CODE_UNITS)
22162211

2217-
return string
2212+
return (s, cx.opts.string_encoding, tagged_code_units)
22182213
```
22192214

22202215
Error context values are lifted directly from the current component instance's

design/mvp/canonical-abi/definitions.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,15 +1286,11 @@ def load_string(cx, ptr) -> String:
12861286

12871287
UTF16_TAG = 1 << 31
12881288

1289-
def worst_case_string_byte_length(string : String):
1290-
(s, encoding, tagged_code_units) = string
1291-
if encoding == 'utf16' or (encoding == 'latin1+utf16' and (tagged_code_units & UTF16_TAG)):
1292-
for code_point in s:
1293-
if ord(code_point) >= 2 ** 7:
1294-
return 3 * (tagged_code_units & ~UTF16_TAG)
1295-
return 2 * (tagged_code_units & ~UTF16_TAG)
1296-
12971289
MAX_STRING_BYTE_LENGTH = (1 << 31) - 1
1290+
MAX_STRING_CODE_UNITS = (1 << 28) - 1
1291+
# The worst case for string byte length comes in store_utf16_to_utf8 where
1292+
# we may end up with 3 bytes for each original code unit.
1293+
assert(MAX_STRING_CODE_UNITS * 3 <= MAX_STRING_BYTE_LENGTH)
12981294

12991295
def load_string_from_range(cx, ptr, tagged_code_units) -> String:
13001296
match cx.opts.string_encoding:
@@ -1322,10 +1318,9 @@ def load_string_from_range(cx, ptr, tagged_code_units) -> String:
13221318
except UnicodeError:
13231319
trap()
13241320

1325-
string = (s, cx.opts.string_encoding, tagged_code_units)
1326-
trap_if(worst_case_string_byte_length(string) > MAX_STRING_BYTE_LENGTH)
1321+
trap_if((tagged_code_units & ~UTF16_TAG) > MAX_STRING_CODE_UNITS)
13271322

1328-
return string
1323+
return (s, cx.opts.string_encoding, tagged_code_units)
13291324

13301325
def lift_error_context(cx, i):
13311326
errctx = cx.inst.handles.get(i)

design/mvp/canonical-abi/run_tests.py

Lines changed: 0 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -466,58 +466,6 @@ def on_resolve(result):
466466
test_roundtrip(t, v, addr_type=addr_type)
467467

468468

469-
def assert_trap_on_load_string(src_encoding, s, tagged_code_units, encoded):
470-
ptr_offset = 8
471-
memory = bytearray(ptr_offset + len(encoded))
472-
memory[0:4] = int.to_bytes(ptr_offset, 4, 'little')
473-
memory[4:8] = int.to_bytes(tagged_code_units, 4, 'little')
474-
memory[ptr_offset:] = encoded
475-
cx = mk_cx(MemInst(memory, 'i32'), src_encoding)
476-
try:
477-
load(cx, 0, StringType())
478-
fail("expected trap loading {!r} as {}".format(s, src_encoding))
479-
except Trap:
480-
pass
481-
482-
def test_string_byte_length_limit():
483-
saved = definitions.MAX_STRING_BYTE_LENGTH
484-
try:
485-
definitions.MAX_STRING_BYTE_LENGTH = 20
486-
487-
# Loading from UTF-8: 10 bytes will succeed, 11 bytes will trap on load
488-
for dst in encodings:
489-
test_string('utf8', dst, 'helloworld')
490-
assert_trap_on_load_string('utf8', 'hello world', 11, b'hello world')
491-
492-
# Loading from UTF-16 all ASCII: 10 code units will succeed, 11 will trap on
493-
# load
494-
for dst in encodings:
495-
test_string('utf16', dst, 'abcdefghij')
496-
assert_trap_on_load_string('utf16', 'abcdefghijk', 11,
497-
'abcdefghijk'.encode('utf-16-le'))
498-
499-
# UTF-16 non-ASCII: 6 code units will succeed, 7 will trap on load
500-
for dst in encodings:
501-
test_string('utf16', dst, 'ab\u0100def')
502-
assert_trap_on_load_string('utf16', '\u0100abcdef', 7,
503-
'\u0100abcdef'.encode('utf-16-le'))
504-
505-
# Latin1+utf16 (latin1): 10 bytes will succeed, 11 will trap on load
506-
for dst in encodings:
507-
test_string('latin1+utf16', dst, 'helloworld')
508-
assert_trap_on_load_string('latin1+utf16', 'hello world', 11,
509-
b'hello world')
510-
511-
# Latin1+utf16 (utf16 variant, non-ASCII): 6 code units will succeed, 7
512-
# will trap on load
513-
for dst in encodings:
514-
test_string('latin1+utf16', dst, '\u0100abcde')
515-
assert_trap_on_load_string('latin1+utf16', '\u0100abcdef', 7 | UTF16_TAG,
516-
'\u0100abcdef'.encode('utf-16-le'))
517-
518-
finally:
519-
definitions.MAX_STRING_BYTE_LENGTH = saved
520-
521469
def test_list_byte_length_limit():
522470
saved = definitions.MAX_LIST_BYTE_LENGTH
523471
try:
@@ -2931,7 +2879,6 @@ def mk_task(supertask, inst):
29312879

29322880

29332881
test_roundtrips()
2934-
test_string_byte_length_limit()
29352882
test_list_byte_length_limit()
29362883
test_handles()
29372884
test_async_to_async()

0 commit comments

Comments
 (0)