@@ -2123,10 +2123,13 @@ def convert_i32_to_char(cx, i):
21232123```
21242124
21252125Strings are loaded from two pointer-sized values: a pointer (offset in linear
2126- memory) and a number of [ code units] . There are three supported string
2127- encodings in [ ` canonopt ` ] : [ UTF-8] , [ UTF-16] and ` latin1+utf16 ` . This last
2128- option allows a * dynamic* choice between [ Latin-1] and UTF-16, indicated by
2129- the high bit of the second pointer-sized value. String values include their
2126+ memory) and a number of [ code units] . There are three supported string encodings
2127+ in [ ` canonopt ` ] : [ UTF-8] , [ UTF-16] and ` latin1+utf16 ` . This last option allows a
2128+ * dynamic* choice between [ Latin-1] and UTF-16, indicated by the 32nd bit of the
2129+ second pointer-sized value. The length of a string is limited so that the number
2130+ of code units fits in 31 bits (leaving the 32nd bit free as the flag). This
2131+ maximum length is enforced even on 64-bit memories to ensure that components using them don't define
2132+ interfaces which 32-bit components couldn't handle. String values include their
21302133original encoding and length in tagged code units as a "hint" that enables
21312134` store_string ` (defined below) to make better up-front allocation size choices
21322135in many cases. Thus, the value produced by ` load_string ` isn't simply a Python
@@ -2140,8 +2143,7 @@ def load_string(cx, ptr) -> String:
21402143 tagged_code_units = load_int(cx, ptr + cx.opts.memory.ptr_size(), cx.opts.memory.ptr_size())
21412144 return load_string_from_range(cx, begin, tagged_code_units)
21422145
2143- def utf16_tag (opts ):
2144- return 1 << (opts.memory.ptr_size() * 8 - 1 )
2146+ UTF16_TAG = 1 << 31
21452147
21462148def load_string_from_range (cx , ptr , tagged_code_units ) -> String:
21472149 match cx.opts.string_encoding:
@@ -2155,8 +2157,8 @@ def load_string_from_range(cx, ptr, tagged_code_units) -> String:
21552157 encoding = ' utf-16-le'
21562158 case ' latin1+utf16' :
21572159 alignment = 2
2158- if bool (tagged_code_units & utf16_tag(cx.opts) ):
2159- byte_length = 2 * (tagged_code_units ^ utf16_tag(cx.opts) )
2160+ if bool (tagged_code_units & UTF16_TAG ):
2161+ byte_length = 2 * (tagged_code_units ^ UTF16_TAG )
21602162 encoding = ' utf-16-le'
21612163 else :
21622164 byte_length = tagged_code_units
@@ -2431,7 +2433,7 @@ original encoding and number of source [code units]. From this hint data,
24312433
24322434We start with a case analysis to enumerate all the meaningful encoding
24332435combinations, subdividing the ` latin1+utf16 ` encoding into either ` latin1 ` or
2434- ` utf16 ` based on the ` utf16_tag ` flag set by ` load_string ` :
2436+ ` utf16 ` based on the ` UTF16_TAG ` flag set by ` load_string ` :
24352437``` python
24362438def store_string (cx , v : String, ptr ):
24372439 begin, tagged_code_units = store_string_into_range(cx, v)
@@ -2442,9 +2444,9 @@ def store_string_into_range(cx, v: String):
24422444 src, src_encoding, src_tagged_code_units = v
24432445
24442446 if src_encoding == ' latin1+utf16' :
2445- if bool (src_tagged_code_units & utf16_tag(cx.opts) ):
2447+ if bool (src_tagged_code_units & UTF16_TAG ):
24462448 src_simple_encoding = ' utf16'
2447- src_code_units = src_tagged_code_units ^ utf16_tag(cx.opts)
2449+ src_code_units = src_tagged_code_units ^ UTF16_TAG
24482450 else :
24492451 src_simple_encoding = ' latin1'
24502452 src_code_units = src_tagged_code_units
@@ -2477,12 +2479,11 @@ The simplest 4 cases above can compute the exact destination size and then copy
24772479with a simple loop (that possibly inflates Latin-1 to UTF-16 by injecting a 0
24782480byte after every Latin-1 byte).
24792481``` python
2480- def max_string_byte_length (opts ):
2481- return (1 << (opts.memory.ptr_size() * 8 - 1 )) - 1
2482+ MAX_STRING_BYTE_LENGTH = (1 << 31 ) - 1
24822483
24832484def store_string_copy (cx , src , src_code_units , dst_code_unit_size , dst_alignment , dst_encoding ):
24842485 dst_byte_length = dst_code_unit_size * src_code_units
2485- trap_if(dst_byte_length > max_string_byte_length(cx.opts) )
2486+ trap_if(dst_byte_length > MAX_STRING_BYTE_LENGTH )
24862487 ptr = cx.opts.realloc(0 , 0 , dst_alignment, dst_byte_length)
24872488 trap_if(ptr != align_to(ptr, dst_alignment))
24882489 trap_if(ptr + dst_byte_length > len (cx.opts.memory.bytes))
@@ -2491,8 +2492,8 @@ def store_string_copy(cx, src, src_code_units, dst_code_unit_size, dst_alignment
24912492 cx.opts.memory.bytes[ptr : ptr+ len (encoded)] = encoded
24922493 return (ptr, src_code_units)
24932494```
2494- The ` max_string_byte_length ` function ensures that the high bit of a
2495- string's number of code units is never set, keeping it clear for ` utf16_tag ` .
2495+ The ` MAX_STRING_BYTE_LENGTH ` constant ensures that the 32nd bit of a
2496+ string's number of code units is never set, keeping it clear for ` UTF16_TAG ` .
24962497
24972498The 2 cases of transcoding into UTF-8 share an algorithm that starts by
24982499optimistically assuming that each code unit of the source string fits in a
@@ -2508,14 +2509,14 @@ def store_latin1_to_utf8(cx, src, src_code_units):
25082509 return store_string_to_utf8(cx, src, src_code_units, worst_case_size)
25092510
25102511def store_string_to_utf8 (cx , src , src_code_units , worst_case_size ):
2511- assert (src_code_units <= max_string_byte_length(cx.opts) )
2512+ assert (src_code_units <= MAX_STRING_BYTE_LENGTH )
25122513 ptr = cx.opts.realloc(0 , 0 , 1 , src_code_units)
25132514 trap_if(ptr + src_code_units > len (cx.opts.memory.bytes))
25142515 for i,code_point in enumerate (src):
25152516 if ord (code_point) < 2 ** 7 :
25162517 cx.opts.memory.bytes[ptr + i] = ord (code_point)
25172518 else :
2518- trap_if(worst_case_size > max_string_byte_length(cx.opts) )
2519+ trap_if(worst_case_size > MAX_STRING_BYTE_LENGTH )
25192520 ptr = cx.opts.realloc(ptr, src_code_units, 1 , worst_case_size)
25202521 trap_if(ptr + worst_case_size > len (cx.opts.memory.bytes))
25212522 encoded = src.encode(' utf-8' )
@@ -2534,7 +2535,7 @@ if multiple UTF-8 bytes were collapsed into a single 2-byte UTF-16 code unit:
25342535``` python
25352536def store_utf8_to_utf16 (cx , src , src_code_units ):
25362537 worst_case_size = 2 * src_code_units
2537- trap_if(worst_case_size > max_string_byte_length(cx.opts) )
2538+ trap_if(worst_case_size > MAX_STRING_BYTE_LENGTH )
25382539 ptr = cx.opts.realloc(0 , 0 , 2 , worst_case_size)
25392540 trap_if(ptr != align_to(ptr, 2 ))
25402541 trap_if(ptr + worst_case_size > len (cx.opts.memory.bytes))
@@ -2558,7 +2559,7 @@ after every Latin-1 byte (iterating in reverse to avoid clobbering later
25582559bytes):
25592560``` python
25602561def store_string_to_latin1_or_utf16 (cx , src , src_code_units ):
2561- assert (src_code_units <= max_string_byte_length(cx.opts) )
2562+ assert (src_code_units <= MAX_STRING_BYTE_LENGTH )
25622563 ptr = cx.opts.realloc(0 , 0 , 2 , src_code_units)
25632564 trap_if(ptr != align_to(ptr, 2 ))
25642565 trap_if(ptr + src_code_units > len (cx.opts.memory.bytes))
@@ -2569,7 +2570,7 @@ def store_string_to_latin1_or_utf16(cx, src, src_code_units):
25692570 dst_byte_length += 1
25702571 else :
25712572 worst_case_size = 2 * src_code_units
2572- trap_if(worst_case_size > max_string_byte_length(cx.opts) )
2573+ trap_if(worst_case_size > MAX_STRING_BYTE_LENGTH )
25732574 ptr = cx.opts.realloc(ptr, src_code_units, 2 , worst_case_size)
25742575 trap_if(ptr != align_to(ptr, 2 ))
25752576 trap_if(ptr + worst_case_size > len (cx.opts.memory.bytes))
@@ -2582,7 +2583,7 @@ def store_string_to_latin1_or_utf16(cx, src, src_code_units):
25822583 ptr = cx.opts.realloc(ptr, worst_case_size, 2 , len (encoded))
25832584 trap_if(ptr != align_to(ptr, 2 ))
25842585 trap_if(ptr + len (encoded) > len (cx.opts.memory.bytes))
2585- tagged_code_units = int (len (encoded) / 2 ) | utf16_tag(cx.opts)
2586+ tagged_code_units = int (len (encoded) / 2 ) | UTF16_TAG
25862587 return (ptr, tagged_code_units)
25872588 if dst_byte_length < src_code_units:
25882589 ptr = cx.opts.realloc(ptr, src_code_units, 2 , dst_byte_length)
@@ -2604,14 +2605,14 @@ inexpensively fused with the UTF-16 validate+copy loop.)
26042605``` python
26052606def store_probably_utf16_to_latin1_or_utf16 (cx , src , src_code_units ):
26062607 src_byte_length = 2 * src_code_units
2607- trap_if(src_byte_length > max_string_byte_length(cx.opts) )
2608+ trap_if(src_byte_length > MAX_STRING_BYTE_LENGTH )
26082609 ptr = cx.opts.realloc(0 , 0 , 2 , src_byte_length)
26092610 trap_if(ptr != align_to(ptr, 2 ))
26102611 trap_if(ptr + src_byte_length > len (cx.opts.memory.bytes))
26112612 encoded = src.encode(' utf-16-le' )
26122613 cx.opts.memory.bytes[ptr : ptr+ len (encoded)] = encoded
26132614 if any (ord (c) >= (1 << 8 ) for c in src):
2614- tagged_code_units = int (len (encoded) / 2 ) | utf16_tag(cx.opts)
2615+ tagged_code_units = int (len (encoded) / 2 ) | UTF16_TAG
26152616 return (ptr, tagged_code_units)
26162617 latin1_size = int (len (encoded) / 2 )
26172618 for i in range (latin1_size):
@@ -2631,7 +2632,9 @@ def lower_error_context(cx, v):
26312632Lists and records are stored by recursively storing their elements and
26322633are symmetric to the loading functions. Unlike strings, lists can
26332634simply allocate based on the up-front knowledge of length and static
2634- element size.
2635+ element size. Storing a list that exceeds the size of a 32-bit memory traps even
2636+ when storing on a 64-bit platform to avoid having interfaces that 32-bit
2637+ components can't use.
26352638``` python
26362639def store_list (cx , v , ptr , elem_type , maybe_length ):
26372640 if maybe_length is not None :
@@ -2644,7 +2647,7 @@ def store_list(cx, v, ptr, elem_type, maybe_length):
26442647
26452648def store_list_into_range (cx , v , elem_type ):
26462649 byte_length = len (v) * elem_size(elem_type, cx.opts)
2647- trap_if(byte_length >= (1 << (cx.opts.memory.ptr_size() * 8 ) ))
2650+ trap_if(byte_length >= (1 << 32 ))
26482651 ptr = cx.opts.realloc(0 , 0 , alignment(elem_type, cx.opts), byte_length)
26492652 trap_if(ptr != align_to(ptr, alignment(elem_type, cx.opts)))
26502653 trap_if(ptr + byte_length > len (cx.opts.memory.bytes))
0 commit comments