Skip to content

Commit 0270ee3

Browse files
revert max string length to a const
1 parent e06bda6 commit 0270ee3

File tree

3 files changed

+46
-45
lines changed

3 files changed

+46
-45
lines changed

design/mvp/CanonicalABI.md

Lines changed: 29 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2123,10 +2123,13 @@ def convert_i32_to_char(cx, i):
21232123
```
21242124

21252125
Strings are loaded from two pointer-sized values: a pointer (offset in linear
2126-
memory) and a number of [code units]. There are three supported string
2127-
encodings in [`canonopt`]: [UTF-8], [UTF-16] and `latin1+utf16`. This last
2128-
option allows a *dynamic* choice between [Latin-1] and UTF-16, indicated by
2129-
the high bit of the second pointer-sized value. String values include their
2126+
memory) and a number of [code units]. There are three supported string encodings
2127+
in [`canonopt`]: [UTF-8], [UTF-16] and `latin1+utf16`. This last option allows a
2128+
*dynamic* choice between [Latin-1] and UTF-16, indicated by the 32nd bit of the
2129+
second pointer-sized value. The length of a string is limited so that the number
2130+
of code units fits in 31 bits (leaving the 32nd bit free as the flag). This
2131+
maximum length is enforced even on 64-bit memories to ensure they don't define
2132+
interfaces which 32-bit components couldn't handle. String values include their
21302133
original encoding and length in tagged code units as a "hint" that enables
21312134
`store_string` (defined below) to make better up-front allocation size choices
21322135
in many cases. Thus, the value produced by `load_string` isn't simply a Python
@@ -2140,8 +2143,7 @@ def load_string(cx, ptr) -> String:
21402143
tagged_code_units = load_int(cx, ptr + cx.opts.memory.ptr_size(), cx.opts.memory.ptr_size())
21412144
return load_string_from_range(cx, begin, tagged_code_units)
21422145

2143-
def utf16_tag(opts):
2144-
return 1 << (opts.memory.ptr_size() * 8 - 1)
2146+
UTF16_TAG = 1 << 31
21452147

21462148
def load_string_from_range(cx, ptr, tagged_code_units) -> String:
21472149
match cx.opts.string_encoding:
@@ -2155,8 +2157,8 @@ def load_string_from_range(cx, ptr, tagged_code_units) -> String:
21552157
encoding = 'utf-16-le'
21562158
case 'latin1+utf16':
21572159
alignment = 2
2158-
if bool(tagged_code_units & utf16_tag(cx.opts)):
2159-
byte_length = 2 * (tagged_code_units ^ utf16_tag(cx.opts))
2160+
if bool(tagged_code_units & UTF16_TAG):
2161+
byte_length = 2 * (tagged_code_units ^ UTF16_TAG)
21602162
encoding = 'utf-16-le'
21612163
else:
21622164
byte_length = tagged_code_units
@@ -2431,7 +2433,7 @@ original encoding and number of source [code units]. From this hint data,
24312433

24322434
We start with a case analysis to enumerate all the meaningful encoding
24332435
combinations, subdividing the `latin1+utf16` encoding into either `latin1` or
2434-
`utf16` based on the `utf16_tag` flag set by `load_string`:
2436+
`utf16` based on the `UTF16_TAG` flag set by `load_string`:
24352437
```python
24362438
def store_string(cx, v: String, ptr):
24372439
begin, tagged_code_units = store_string_into_range(cx, v)
@@ -2442,9 +2444,9 @@ def store_string_into_range(cx, v: String):
24422444
src, src_encoding, src_tagged_code_units = v
24432445

24442446
if src_encoding == 'latin1+utf16':
2445-
if bool(src_tagged_code_units & utf16_tag(cx.opts)):
2447+
if bool(src_tagged_code_units & UTF16_TAG):
24462448
src_simple_encoding = 'utf16'
2447-
src_code_units = src_tagged_code_units ^ utf16_tag(cx.opts)
2449+
src_code_units = src_tagged_code_units ^ UTF16_TAG
24482450
else:
24492451
src_simple_encoding = 'latin1'
24502452
src_code_units = src_tagged_code_units
@@ -2477,12 +2479,11 @@ The simplest 4 cases above can compute the exact destination size and then copy
24772479
with a simple loop (that possibly inflates Latin-1 to UTF-16 by injecting a 0
24782480
byte after every Latin-1 byte).
24792481
```python
2480-
def max_string_byte_length(opts):
2481-
return (1 << (opts.memory.ptr_size() * 8 - 1)) - 1
2482+
MAX_STRING_BYTE_LENGTH = (1 << 31) - 1
24822483

24832484
def store_string_copy(cx, src, src_code_units, dst_code_unit_size, dst_alignment, dst_encoding):
24842485
dst_byte_length = dst_code_unit_size * src_code_units
2485-
trap_if(dst_byte_length > max_string_byte_length(cx.opts))
2486+
trap_if(dst_byte_length > MAX_STRING_BYTE_LENGTH)
24862487
ptr = cx.opts.realloc(0, 0, dst_alignment, dst_byte_length)
24872488
trap_if(ptr != align_to(ptr, dst_alignment))
24882489
trap_if(ptr + dst_byte_length > len(cx.opts.memory.bytes))
@@ -2491,8 +2492,8 @@ def store_string_copy(cx, src, src_code_units, dst_code_unit_size, dst_alignment
24912492
cx.opts.memory.bytes[ptr : ptr+len(encoded)] = encoded
24922493
return (ptr, src_code_units)
24932494
```
2494-
The `max_string_byte_length` function ensures that the high bit of a
2495-
string's number of code units is never set, keeping it clear for `utf16_tag`.
2495+
The `MAX_STRING_BYTE_LENGTH` constant ensures that the 32nd bit of a
2496+
string's number of code units is never set, keeping it clear for `UTF16_TAG`.
24962497

24972498
The 2 cases of transcoding into UTF-8 share an algorithm that starts by
24982499
optimistically assuming that each code unit of the source string fits in a
@@ -2508,14 +2509,14 @@ def store_latin1_to_utf8(cx, src, src_code_units):
25082509
return store_string_to_utf8(cx, src, src_code_units, worst_case_size)
25092510

25102511
def store_string_to_utf8(cx, src, src_code_units, worst_case_size):
2511-
assert(src_code_units <= max_string_byte_length(cx.opts))
2512+
assert(src_code_units <= MAX_STRING_BYTE_LENGTH)
25122513
ptr = cx.opts.realloc(0, 0, 1, src_code_units)
25132514
trap_if(ptr + src_code_units > len(cx.opts.memory.bytes))
25142515
for i,code_point in enumerate(src):
25152516
if ord(code_point) < 2**7:
25162517
cx.opts.memory.bytes[ptr + i] = ord(code_point)
25172518
else:
2518-
trap_if(worst_case_size > max_string_byte_length(cx.opts))
2519+
trap_if(worst_case_size > MAX_STRING_BYTE_LENGTH)
25192520
ptr = cx.opts.realloc(ptr, src_code_units, 1, worst_case_size)
25202521
trap_if(ptr + worst_case_size > len(cx.opts.memory.bytes))
25212522
encoded = src.encode('utf-8')
@@ -2534,7 +2535,7 @@ if multiple UTF-8 bytes were collapsed into a single 2-byte UTF-16 code unit:
25342535
```python
25352536
def store_utf8_to_utf16(cx, src, src_code_units):
25362537
worst_case_size = 2 * src_code_units
2537-
trap_if(worst_case_size > max_string_byte_length(cx.opts))
2538+
trap_if(worst_case_size > MAX_STRING_BYTE_LENGTH)
25382539
ptr = cx.opts.realloc(0, 0, 2, worst_case_size)
25392540
trap_if(ptr != align_to(ptr, 2))
25402541
trap_if(ptr + worst_case_size > len(cx.opts.memory.bytes))
@@ -2558,7 +2559,7 @@ after every Latin-1 byte (iterating in reverse to avoid clobbering later
25582559
bytes):
25592560
```python
25602561
def store_string_to_latin1_or_utf16(cx, src, src_code_units):
2561-
assert(src_code_units <= max_string_byte_length(cx.opts))
2562+
assert(src_code_units <= MAX_STRING_BYTE_LENGTH)
25622563
ptr = cx.opts.realloc(0, 0, 2, src_code_units)
25632564
trap_if(ptr != align_to(ptr, 2))
25642565
trap_if(ptr + src_code_units > len(cx.opts.memory.bytes))
@@ -2569,7 +2570,7 @@ def store_string_to_latin1_or_utf16(cx, src, src_code_units):
25692570
dst_byte_length += 1
25702571
else:
25712572
worst_case_size = 2 * src_code_units
2572-
trap_if(worst_case_size > max_string_byte_length(cx.opts))
2573+
trap_if(worst_case_size > MAX_STRING_BYTE_LENGTH)
25732574
ptr = cx.opts.realloc(ptr, src_code_units, 2, worst_case_size)
25742575
trap_if(ptr != align_to(ptr, 2))
25752576
trap_if(ptr + worst_case_size > len(cx.opts.memory.bytes))
@@ -2582,7 +2583,7 @@ def store_string_to_latin1_or_utf16(cx, src, src_code_units):
25822583
ptr = cx.opts.realloc(ptr, worst_case_size, 2, len(encoded))
25832584
trap_if(ptr != align_to(ptr, 2))
25842585
trap_if(ptr + len(encoded) > len(cx.opts.memory.bytes))
2585-
tagged_code_units = int(len(encoded) / 2) | utf16_tag(cx.opts)
2586+
tagged_code_units = int(len(encoded) / 2) | UTF16_TAG
25862587
return (ptr, tagged_code_units)
25872588
if dst_byte_length < src_code_units:
25882589
ptr = cx.opts.realloc(ptr, src_code_units, 2, dst_byte_length)
@@ -2604,14 +2605,14 @@ inexpensively fused with the UTF-16 validate+copy loop.)
26042605
```python
26052606
def store_probably_utf16_to_latin1_or_utf16(cx, src, src_code_units):
26062607
src_byte_length = 2 * src_code_units
2607-
trap_if(src_byte_length > max_string_byte_length(cx.opts))
2608+
trap_if(src_byte_length > MAX_STRING_BYTE_LENGTH)
26082609
ptr = cx.opts.realloc(0, 0, 2, src_byte_length)
26092610
trap_if(ptr != align_to(ptr, 2))
26102611
trap_if(ptr + src_byte_length > len(cx.opts.memory.bytes))
26112612
encoded = src.encode('utf-16-le')
26122613
cx.opts.memory.bytes[ptr : ptr+len(encoded)] = encoded
26132614
if any(ord(c) >= (1 << 8) for c in src):
2614-
tagged_code_units = int(len(encoded) / 2) | utf16_tag(cx.opts)
2615+
tagged_code_units = int(len(encoded) / 2) | UTF16_TAG
26152616
return (ptr, tagged_code_units)
26162617
latin1_size = int(len(encoded) / 2)
26172618
for i in range(latin1_size):
@@ -2631,7 +2632,9 @@ def lower_error_context(cx, v):
26312632
Lists and records are stored by recursively storing their elements and
26322633
are symmetric to the loading functions. Unlike strings, lists can
26332634
simply allocate based on the up-front knowledge of length and static
2634-
element size.
2635+
element size. Storing a list that exceeds the size of a 32-bit memory traps even
2636+
when storing on a 64-bit platform to avoid having interfaces that 32-bit
2637+
components can't use.
26352638
```python
26362639
def store_list(cx, v, ptr, elem_type, maybe_length):
26372640
if maybe_length is not None:
@@ -2644,7 +2647,7 @@ def store_list(cx, v, ptr, elem_type, maybe_length):
26442647

26452648
def store_list_into_range(cx, v, elem_type):
26462649
byte_length = len(v) * elem_size(elem_type, cx.opts)
2647-
trap_if(byte_length >= (1 << (cx.opts.memory.ptr_size() * 8)))
2650+
trap_if(byte_length >= (1 << 32))
26482651
ptr = cx.opts.realloc(0, 0, alignment(elem_type, cx.opts), byte_length)
26492652
trap_if(ptr != align_to(ptr, alignment(elem_type, cx.opts)))
26502653
trap_if(ptr + byte_length > len(cx.opts.memory.bytes))

design/mvp/canonical-abi/definitions.py

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1267,8 +1267,7 @@ def load_string(cx, ptr) -> String:
12671267
tagged_code_units = load_int(cx, ptr + cx.opts.memory.ptr_size(), cx.opts.memory.ptr_size())
12681268
return load_string_from_range(cx, begin, tagged_code_units)
12691269

1270-
def utf16_tag(opts):
1271-
return 1 << (opts.memory.ptr_size() * 8 - 1)
1270+
UTF16_TAG = 1 << 31
12721271

12731272
def load_string_from_range(cx, ptr, tagged_code_units) -> String:
12741273
match cx.opts.string_encoding:
@@ -1282,8 +1281,8 @@ def load_string_from_range(cx, ptr, tagged_code_units) -> String:
12821281
encoding = 'utf-16-le'
12831282
case 'latin1+utf16':
12841283
alignment = 2
1285-
if bool(tagged_code_units & utf16_tag(cx.opts)):
1286-
byte_length = 2 * (tagged_code_units ^ utf16_tag(cx.opts))
1284+
if bool(tagged_code_units & UTF16_TAG):
1285+
byte_length = 2 * (tagged_code_units ^ UTF16_TAG)
12871286
encoding = 'utf-16-le'
12881287
else:
12891288
byte_length = tagged_code_units
@@ -1464,9 +1463,9 @@ def store_string_into_range(cx, v: String):
14641463
src, src_encoding, src_tagged_code_units = v
14651464

14661465
if src_encoding == 'latin1+utf16':
1467-
if bool(src_tagged_code_units & utf16_tag(cx.opts)):
1466+
if bool(src_tagged_code_units & UTF16_TAG):
14681467
src_simple_encoding = 'utf16'
1469-
src_code_units = src_tagged_code_units ^ utf16_tag(cx.opts)
1468+
src_code_units = src_tagged_code_units ^ UTF16_TAG
14701469
else:
14711470
src_simple_encoding = 'latin1'
14721471
src_code_units = src_tagged_code_units
@@ -1494,12 +1493,11 @@ def store_string_into_range(cx, v: String):
14941493
case 'latin1' : return store_string_copy(cx, src, src_code_units, 1, 2, 'latin-1')
14951494
case 'utf16' : return store_probably_utf16_to_latin1_or_utf16(cx, src, src_code_units)
14961495

1497-
def max_string_byte_length(opts):
1498-
return (1 << (opts.memory.ptr_size() * 8 - 1)) - 1
1496+
MAX_STRING_BYTE_LENGTH = (1 << 31) - 1
14991497

15001498
def store_string_copy(cx, src, src_code_units, dst_code_unit_size, dst_alignment, dst_encoding):
15011499
dst_byte_length = dst_code_unit_size * src_code_units
1502-
trap_if(dst_byte_length > max_string_byte_length(cx.opts))
1500+
trap_if(dst_byte_length > MAX_STRING_BYTE_LENGTH)
15031501
ptr = cx.opts.realloc(0, 0, dst_alignment, dst_byte_length)
15041502
trap_if(ptr != align_to(ptr, dst_alignment))
15051503
trap_if(ptr + dst_byte_length > len(cx.opts.memory.bytes))
@@ -1517,14 +1515,14 @@ def store_latin1_to_utf8(cx, src, src_code_units):
15171515
return store_string_to_utf8(cx, src, src_code_units, worst_case_size)
15181516

15191517
def store_string_to_utf8(cx, src, src_code_units, worst_case_size):
1520-
assert(src_code_units <= max_string_byte_length(cx.opts))
1518+
assert(src_code_units <= MAX_STRING_BYTE_LENGTH)
15211519
ptr = cx.opts.realloc(0, 0, 1, src_code_units)
15221520
trap_if(ptr + src_code_units > len(cx.opts.memory.bytes))
15231521
for i,code_point in enumerate(src):
15241522
if ord(code_point) < 2**7:
15251523
cx.opts.memory.bytes[ptr + i] = ord(code_point)
15261524
else:
1527-
trap_if(worst_case_size > max_string_byte_length(cx.opts))
1525+
trap_if(worst_case_size > MAX_STRING_BYTE_LENGTH)
15281526
ptr = cx.opts.realloc(ptr, src_code_units, 1, worst_case_size)
15291527
trap_if(ptr + worst_case_size > len(cx.opts.memory.bytes))
15301528
encoded = src.encode('utf-8')
@@ -1537,7 +1535,7 @@ def store_string_to_utf8(cx, src, src_code_units, worst_case_size):
15371535

15381536
def store_utf8_to_utf16(cx, src, src_code_units):
15391537
worst_case_size = 2 * src_code_units
1540-
trap_if(worst_case_size > max_string_byte_length(cx.opts))
1538+
trap_if(worst_case_size > MAX_STRING_BYTE_LENGTH)
15411539
ptr = cx.opts.realloc(0, 0, 2, worst_case_size)
15421540
trap_if(ptr != align_to(ptr, 2))
15431541
trap_if(ptr + worst_case_size > len(cx.opts.memory.bytes))
@@ -1551,7 +1549,7 @@ def store_utf8_to_utf16(cx, src, src_code_units):
15511549
return (ptr, code_units)
15521550

15531551
def store_string_to_latin1_or_utf16(cx, src, src_code_units):
1554-
assert(src_code_units <= max_string_byte_length(cx.opts))
1552+
assert(src_code_units <= MAX_STRING_BYTE_LENGTH)
15551553
ptr = cx.opts.realloc(0, 0, 2, src_code_units)
15561554
trap_if(ptr != align_to(ptr, 2))
15571555
trap_if(ptr + src_code_units > len(cx.opts.memory.bytes))
@@ -1562,7 +1560,7 @@ def store_string_to_latin1_or_utf16(cx, src, src_code_units):
15621560
dst_byte_length += 1
15631561
else:
15641562
worst_case_size = 2 * src_code_units
1565-
trap_if(worst_case_size > max_string_byte_length(cx.opts))
1563+
trap_if(worst_case_size > MAX_STRING_BYTE_LENGTH)
15661564
ptr = cx.opts.realloc(ptr, src_code_units, 2, worst_case_size)
15671565
trap_if(ptr != align_to(ptr, 2))
15681566
trap_if(ptr + worst_case_size > len(cx.opts.memory.bytes))
@@ -1575,7 +1573,7 @@ def store_string_to_latin1_or_utf16(cx, src, src_code_units):
15751573
ptr = cx.opts.realloc(ptr, worst_case_size, 2, len(encoded))
15761574
trap_if(ptr != align_to(ptr, 2))
15771575
trap_if(ptr + len(encoded) > len(cx.opts.memory.bytes))
1578-
tagged_code_units = int(len(encoded) / 2) | utf16_tag(cx.opts)
1576+
tagged_code_units = int(len(encoded) / 2) | UTF16_TAG
15791577
return (ptr, tagged_code_units)
15801578
if dst_byte_length < src_code_units:
15811579
ptr = cx.opts.realloc(ptr, src_code_units, 2, dst_byte_length)
@@ -1585,14 +1583,14 @@ def store_string_to_latin1_or_utf16(cx, src, src_code_units):
15851583

15861584
def store_probably_utf16_to_latin1_or_utf16(cx, src, src_code_units):
15871585
src_byte_length = 2 * src_code_units
1588-
trap_if(src_byte_length > max_string_byte_length(cx.opts))
1586+
trap_if(src_byte_length > MAX_STRING_BYTE_LENGTH)
15891587
ptr = cx.opts.realloc(0, 0, 2, src_byte_length)
15901588
trap_if(ptr != align_to(ptr, 2))
15911589
trap_if(ptr + src_byte_length > len(cx.opts.memory.bytes))
15921590
encoded = src.encode('utf-16-le')
15931591
cx.opts.memory.bytes[ptr : ptr+len(encoded)] = encoded
15941592
if any(ord(c) >= (1 << 8) for c in src):
1595-
tagged_code_units = int(len(encoded) / 2) | utf16_tag(cx.opts)
1593+
tagged_code_units = int(len(encoded) / 2) | UTF16_TAG
15961594
return (ptr, tagged_code_units)
15971595
latin1_size = int(len(encoded) / 2)
15981596
for i in range(latin1_size):
@@ -1615,7 +1613,7 @@ def store_list(cx, v, ptr, elem_type, maybe_length):
16151613

16161614
def store_list_into_range(cx, v, elem_type):
16171615
byte_length = len(v) * elem_size(elem_type, cx.opts)
1618-
trap_if(byte_length >= (1 << (cx.opts.memory.ptr_size() * 8)))
1616+
trap_if(byte_length >= (1 << 32))
16191617
ptr = cx.opts.realloc(0, 0, alignment(elem_type, cx.opts), byte_length)
16201618
trap_if(ptr != align_to(ptr, alignment(elem_type, cx.opts)))
16211619
trap_if(ptr + byte_length > len(cx.opts.memory.bytes))

design/mvp/canonical-abi/run_tests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ def test_string(src_encoding, dst_encoding, s, addr_type='i32'):
267267
except UnicodeEncodeError:
268268
pass
269269
encoded = s.encode('utf-16-le')
270-
tagged_code_units = int(len(encoded) / 2) | utf16_tag(LiftLowerOptions(memory=MemInst(bytearray(), addr_type)))
270+
tagged_code_units = int(len(encoded) / 2) | UTF16_TAG
271271
test_string_internal(src_encoding, dst_encoding, s, encoded, tagged_code_units, addr_type)
272272

273273
encodings = ['utf8', 'utf16', 'latin1+utf16']

0 commit comments

Comments
 (0)