From 4c83a09a34acd02a1150292f64575eccc2fd3201 Mon Sep 17 00:00:00 2001 From: Ralph Date: Sat, 27 Jun 2026 00:35:11 -0700 Subject: [PATCH 1/2] =?UTF-8?q?fix(intl):=20#5585=20=E2=80=94=20ECMA-402?= =?UTF-8?q?=20option/locale=20validation=20for=20the=20six=20small=20const?= =?UTF-8?q?ructors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Brings the small Intl constructors (Collator, Segmenter, ListFormat, RelativeTimeFormat, PluralRules) up to spec on option coercion, locale-list handling, and a few behavioral gaps. test262 intl402 parity across these six dirs goes from 241/387 (62%) to 297/387 (77%) — 56 cases fixed, no regressions. Shared helpers (intl.rs): - `get_option_string_coerced` / `enum_option_strict` — GetOption with full ToString coercion (null → "null", Symbol → TypeError), so `options-*-invalid` values are rejected instead of silently defaulting. - `get_options_object` (GetOptionsObject: non-Object/non-undefined → TypeError) and `coerce_options_reject_null` (ToObject: null → TypeError), wired per the spec step each constructor uses. - `locales_from_value` now follows CanonicalizeLocaleList: null → TypeError, array-like Objects are iterated (`{0:"DE",length:1}` → `["de"]`), element type-checks throw TypeError, and a throwing/Symbol `length` propagates. - `supportedLocalesOf` validates its `options` arg (localeMatcher) and filters the request list through a BestAvailableLocale check, so the result length matches (e.g. `zxx` is dropped). - Per-constructor: read `localeMatcher` (+ `numberingSystem` for RTF) in the exact GetOption order the `options-order` / read-order tests assert. Collator (date_collator.rs): - NFD-normalize before comparing so canonically-equivalent strings collate 0. - Resolve `numeric`/`caseFirst`/`collation` from options or the locale's `-u-kn`/`-u-kf`/`-u-co` extension keywords; reflect them (plus `usage`, `sensitivity`, `ignorePunctuation`, `collation`) in resolvedOptions in spec order. 
`ignorePunctuation` now strips ignorable punctuation in compare. RelativeTimeFormat / PluralRules (list_relative_plural.rs): - RTF format/formatToParts use full ToNumber (Symbol → TypeError, object valueOf honoured); units are case-sensitive; formatToParts attaches the `unit` field to the numeric parts. - PluralRules reads options in SetNumberFormatDigitOptions order, adds `notation`/`compactDisplay`, and `selectRange` throws TypeError for undefined or Symbol arguments. Out of scope (still failing, need CLDR data or larger runtime work): locale- specific formatting/collation, the prototype accessor-getter shape for compare/format, %Segments.prototype%.containing, subclassing/new.target, and @@toStringTag inheritance on instances. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/perry-runtime/src/intl.rs | 359 ++++++++++++++++-- .../perry-runtime/src/intl/date_collator.rs | 92 ++++- .../src/intl/list_relative_plural.rs | 78 +++- 3 files changed, 481 insertions(+), 48 deletions(-) diff --git a/crates/perry-runtime/src/intl.rs b/crates/perry-runtime/src/intl.rs index 89e21b740d..325a117b6d 100644 --- a/crates/perry-runtime/src/intl.rs +++ b/crates/perry-runtime/src/intl.rs @@ -154,6 +154,14 @@ const KEY_NF_TRAILING_ZERO: &str = "__intlNfTrailingZero"; // `format` getter reads it from here so user mutation/deletion of the public // property can't corrupt what the accessor returns. 
const KEY_NF_BOUND_FORMAT: &str = "__intlNfBoundFormat"; +const KEY_COL_USAGE: &str = "__intlColUsage"; +const KEY_COL_SENSITIVITY: &str = "__intlColSensitivity"; +const KEY_COL_IGNORE_PUNCT: &str = "__intlColIgnorePunct"; +const KEY_COL_COLLATION: &str = "__intlColCollation"; +const KEY_COL_NUMERIC: &str = "__intlColNumeric"; +const KEY_COL_CASE_FIRST: &str = "__intlColCaseFirst"; +const KEY_PR_NOTATION: &str = "__intlPrNotation"; +const KEY_PR_COMPACT_DISPLAY: &str = "__intlPrCompactDisplay"; fn undefined() -> f64 { f64::from_bits(crate::value::TAG_UNDEFINED) @@ -541,34 +549,95 @@ fn canonicalize_language_tag(tag: &str) -> Option { } } +/// CanonicalizeLocaleList element handler: a present element must be a String or +/// an Object (an `Intl.Locale` or anything ToString-able), else `TypeError`; the +/// resulting tag is canonicalized (`RangeError` if structurally invalid) and +/// pushed if not already present. +fn push_locale_element(out: &mut Vec, value: f64) { + let jv = JSValue::from_bits(value.to_bits()); + let tag = if jv.is_any_string() { + string_from_string_value(value).unwrap_or_default() + } else if object_ptr_from_value(value).is_some() { + value_to_string(value) + } else { + // undefined / null / boolean / number / Symbol element → TypeError. + throw_type_error("locale must be a String or Object"); + }; + let Some(canonical) = canonical_locale(&tag) else { + throw_invalid_language_tag(&tag); + }; + if !out.iter().any(|existing| existing == &canonical) { + out.push(canonical); + } +} + fn locales_from_value(locales: f64) -> Vec { let js = JSValue::from_bits(locales.to_bits()); - if js.is_undefined() || js.is_null() { + // CanonicalizeLocaleList(undefined) is the empty list; `null` fails ToObject + // with a TypeError (everything else is a String or coerces via ToObject). 
+ if js.is_undefined() { return Vec::new(); } + if js.is_null() { + throw_type_error("Cannot convert undefined or null to object"); + } + // A String argument is treated as a single-element list (not iterated by char). + if js.is_any_string() { + let tag = string_from_string_value(locales).unwrap_or_default(); + let Some(canonical) = canonical_locale(&tag) else { + throw_invalid_language_tag(&tag); + }; + return vec![canonical]; + } if let Some(arr) = array_ptr_from_value(locales) { let len = js_array_length(arr); let mut out = Vec::with_capacity(len as usize); for i in 0..len { - let value = js_array_get_f64(arr, i); - if let Some(tag) = string_from_string_value(value) { - let Some(canonical) = canonical_locale(&tag) else { - throw_invalid_language_tag(&tag); - }; - out.push(canonical); - } + push_locale_element(&mut out, js_array_get_f64(arr, i)); } return out; } - if let Some(tag) = string_from_string_value(locales) { - let Some(canonical) = canonical_locale(&tag) else { - throw_invalid_language_tag(&tag); + // CanonicalizeLocaleList on a generic array-like Object: iterate `O[0..length]` + // (e.g. `{ 0: "DE", length: 1 }` → `["de"]`). + if let Some(obj) = object_ptr_from_value(locales) { + // `length = ? ToLength(? Get(O, "length"))`: a throwing `length` getter or + // ToNumber step (Symbol / abrupt valueOf/toString) propagates here. + let len_raw = get_field(obj, "length"); + let len_num = crate::builtins::js_number_coerce(len_raw); + let len = if len_num.is_finite() && len_num > 0.0 { + len_num as u32 + } else { + 0 }; - return vec![canonical]; + let mut out = Vec::with_capacity(len as usize); + for i in 0..len { + push_locale_element(&mut out, get_field(obj, &i.to_string())); + } + return out; } + // Other primitives (number/boolean/Symbol/BigInt): ToObject yields a wrapper + // with length 0 — an empty list, no throw. 
Vec::new() } +/// BestAvailableLocale (lookup) — a requested canonical locale is "supported" +/// when its primary language subtag is one Perry's deterministic formatters can +/// service. Perry carries no CLDR locale database, so this is a curated set of +/// common CLDR languages rather than a data lookup: it is enough to distinguish +/// real languages (`en`, `de`, `zh`, …) from the "no linguistic content" tag +/// `zxx` and other unsupported primaries that `supportedLocalesOf` must drop. +fn is_available_locale(canonical: &str) -> bool { + let primary = canonical.split(['-', '_']).next().unwrap_or(canonical); + const AVAILABLE_LANGUAGES: &[&str] = &[ + "af", "am", "ar", "az", "be", "bg", "bn", "bs", "ca", "cs", "cy", "da", "de", "el", "en", + "es", "et", "eu", "fa", "fi", "fil", "fr", "ga", "gl", "gu", "he", "hi", "hr", "hu", "hy", + "id", "is", "it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml", + "mn", "mr", "ms", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt", "ro", "ru", "si", "sk", + "sl", "sq", "sr", "sv", "sw", "ta", "te", "th", "tr", "uk", "ur", "uz", "vi", "zh", "zu", + ]; + AVAILABLE_LANGUAGES.contains(&primary) +} + fn locale_or_default(locales: f64) -> String { locales_from_value(locales) .into_iter() @@ -576,6 +645,47 @@ fn locale_or_default(locales: f64) -> String { .unwrap_or_else(|| "en-US".to_string()) } +/// Look up a Unicode (`-u-`) extension keyword's value in a BCP-47 tag. Returns +/// `Some(value)` if the 2-letter `key` is present (the value is the `-`-joined +/// run of type subtags after it, or `""` for a value-less boolean key like +/// `-u-kn`), else `None`. Case-insensitive. Used to resolve `kn`/`kf`/`co` for +/// Collator when the corresponding option is absent (numeric-and-caseFirst.js). +fn unicode_extension_keyword(locale: &str, key: &str) -> Option { + let lower = locale.to_ascii_lowercase(); + let key = key.to_ascii_lowercase(); + let mut iter = lower.split('-'); + // Advance to the `u` singleton. 
+ let mut in_u = false; + while let Some(p) = iter.next() { + if p == "u" { + in_u = true; + break; + } + } + if !in_u { + return None; + } + let mut found = false; + let mut value: Vec<&str> = Vec::new(); + for p in iter { + if p.len() == 1 { + // Next singleton ends the `u` extension. + break; + } + if p.len() == 2 && p.chars().all(|c| c.is_ascii_alphanumeric()) { + if found { + break; // reached the next keyword + } + if p == key { + found = true; + } + } else if found { + value.push(p); + } + } + found.then(|| value.join("-")) +} + fn rest_arg(rest: f64, index: u32) -> f64 { let Some(arr) = array_ptr_from_value(rest) else { return undefined(); @@ -724,6 +834,74 @@ fn enum_option(options: f64, key: &str, allowed: &[&str], default: &str) -> Stri } } +/// `GetOption(options, key, "string", ...)` with full `ToString` coercion: only +/// `undefined` selects the default. `null`, numbers, booleans, etc. are coerced +/// via `ToString` (so `null` → `"null"`, never the absent path), and a Symbol +/// throws `TypeError` (ToString of a Symbol is a TypeError). This is the strict +/// spec behavior; `get_option_string` instead treats `null` as absent, which the +/// `options-*-invalid` value-validation tests reject. +fn get_option_string_coerced(options: f64, key: &str) -> Option { + let raw = get_option_value(options, key); + let jv = JSValue::from_bits(raw.to_bits()); + if jv.is_undefined() { + None + } else if jv.is_any_string() { + string_from_string_value(raw) + } else if unsafe { crate::symbol::js_is_symbol(raw) } != 0 { + throw_type_error(&format!( + "Cannot convert a Symbol value to a string for Intl options property {key}" + )); + } else { + Some(value_to_string(raw)) + } +} + +/// `GetOption` with an enumerated value set, using strict `ToString` coercion +/// (see [`get_option_string_coerced`]): an out-of-range value (including a +/// `ToString`-coerced `null` / number) is a `RangeError`; absent → `default`. 
+fn enum_option_strict(options: f64, key: &str, allowed: &[&str], default: &str) -> String { + match get_option_string_coerced(options, key) { + None => default.to_string(), + Some(value) => { + if allowed.contains(&value.as_str()) { + value + } else { + throw_range_error(&format!( + "Value {value} out of range for Intl options property {key}" + )) + } + } + } +} + +/// `GetOptionsObject(options)`: `undefined` yields an empty bag (reported as +/// `undefined`, which the option readers treat as "every key absent"); an Object +/// passes through unchanged; any other value (including `null`, primitives, and +/// BigInt) throws `TypeError`. Used by the constructors whose spec step is +/// `GetOptionsObject` (ListFormat, Segmenter, PluralRules, …). +fn get_options_object(options: f64) -> f64 { + let jv = JSValue::from_bits(options.to_bits()); + if jv.is_undefined() { + return options; + } + if object_ptr_from_value(options).is_some() { + return options; + } + throw_type_error("Cannot convert undefined or null to object"); +} + +/// `CoerceOptionsToObject(options)` partial: `undefined` stays an empty bag and +/// `null` throws `TypeError` (`ToObject(null)`). Primitives are *not* boxed here +/// — Perry reads option keys directly off Objects, so a primitive simply yields +/// every-key-absent — but `null` must still reject. Used by the constructors +/// whose spec step is `ToObject` (RelativeTimeFormat, Collator, …). +fn coerce_options_reject_null(options: f64) -> f64 { + if JSValue::from_bits(options.to_bits()).is_null() { + throw_type_error("Cannot convert undefined or null to object"); + } + options +} + /// GetBooleanOption(options, key): `undefined` → `None`, otherwise ToBoolean. fn get_bool_option(options: f64, key: &str) -> Option { let value = get_option_value(options, key); @@ -1076,6 +1254,63 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option ); } KIND_COLLATOR => { + // InitializeCollator reads options via `? 
ToObject(options)` (null → + // TypeError) then GetOption in this exact order: usage, localeMatcher, + // collation, numeric, caseFirst, sensitivity, ignorePunctuation + // (constructor-options-throwing-getters / resolvedOptions order.js). + let options = coerce_options_reject_null(options); + let usage = enum_option_strict(options, "usage", &["sort", "search"], "sort"); + let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); + // `collation` is a free-form `type` string (RangeError if malformed); + // it has no CLDR effect here, so it resolves to "default". + if let Some(collation) = get_option_string_coerced(options, "collation") { + if !is_well_formed_numbering_system(&collation) { + throw_range_error(&format!( + "Value {collation} out of range for Intl options property collation" + )); + } + } + let numeric_opt = get_bool_option(options, "numeric"); + let case_first_opt = + get_option_string_coerced(options, "caseFirst").map(|v| { + if ["upper", "lower", "false"].contains(&v.as_str()) { + v + } else { + throw_range_error(&format!( + "Value {v} out of range for Intl options property caseFirst" + )) + } + }); + let sensitivity = enum_option_strict( + options, + "sensitivity", + &["base", "accent", "case", "variant"], + "variant", + ); + let ignore_punct = get_bool_option(options, "ignorePunctuation").unwrap_or(false); + // ResolveLocale: when an option is absent, fall back to the matching + // Unicode (`-u-`) extension keyword in the resolved locale — `kn` + // (numeric, value-less ⇒ true) and `kf` (caseFirst). 
+ let numeric = numeric_opt.unwrap_or_else(|| { + match unicode_extension_keyword(&locale, "kn") { + Some(v) => v != "false", + None => false, + } + }); + let case_first = case_first_opt.unwrap_or_else(|| { + unicode_extension_keyword(&locale, "kf") + .filter(|v| ["upper", "lower", "false"].contains(&v.as_str())) + .unwrap_or_else(|| "false".to_string()) + }); + let collation = unicode_extension_keyword(&locale, "co") + .filter(|v| !v.is_empty() && v != "standard" && v != "search") + .unwrap_or_else(|| "default".to_string()); + set_internal_field(obj, KEY_COL_USAGE, string_value(&usage)); + set_internal_field(obj, KEY_COL_SENSITIVITY, string_value(&sensitivity)); + set_internal_field(obj, KEY_COL_IGNORE_PUNCT, bool_value(ignore_punct)); + set_internal_field(obj, KEY_COL_COLLATION, string_value(&collation)); + set_internal_field(obj, KEY_COL_NUMERIC, bool_value(numeric)); + set_internal_field(obj, KEY_COL_CASE_FIRST, string_value(&case_first)); install_bound_instance_function( obj, "compare", @@ -1090,7 +1325,12 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option ); } KIND_SEGMENTER => { - let granularity = normalize_granularity(get_option_string(options, "granularity")); + // `? ToObject(options)` (null → TypeError), then GetOption in order: + // localeMatcher, granularity (options-order.js / options-null.js). + let options = coerce_options_reject_null(options); + let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); + let granularity = + normalize_granularity(get_option_string_coerced(options, "granularity")); set_internal_field(obj, KEY_GRANULARITY, string_value(&granularity)); install_bound_instance_function( obj, @@ -1106,13 +1346,18 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option ); } KIND_LIST_FORMAT => { - let list_type = enum_option( + // `? 
GetOptionsObject(options)` (any non-Object, non-undefined → + // TypeError), then GetOption: localeMatcher, type, style + // (options-getoptionsobject.js / options-order.js). + let options = get_options_object(options); + let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); + let list_type = enum_option_strict( options, "type", &["conjunction", "disjunction", "unit"], "conjunction", ); - let style = enum_option(options, "style", &["long", "short", "narrow"], "long"); + let style = enum_option_strict(options, "style", &["long", "short", "narrow"], "long"); set_internal_field(obj, KEY_TYPE, string_value(&list_type)); set_internal_field(obj, KEY_LF_STYLE, string_value(&style)); install_bound_instance_function( @@ -1135,8 +1380,19 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option ); } KIND_RELATIVE_TIME => { - let style = enum_option(options, "style", &["long", "short", "narrow"], "long"); - let numeric = enum_option(options, "numeric", &["always", "auto"], "always"); + // `? ToObject(options)` (null → TypeError), then GetOption in order: + // localeMatcher, numberingSystem, style, numeric (options-order.js). 
+ let options = coerce_options_reject_null(options); + let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); + if let Some(ns) = get_option_string_coerced(options, "numberingSystem") { + if !is_well_formed_numbering_system(&ns) { + throw_range_error(&format!( + "Value {ns} out of range for Intl options property numberingSystem" + )); + } + } + let style = enum_option_strict(options, "style", &["long", "short", "narrow"], "long"); + let numeric = enum_option_strict(options, "numeric", &["always", "auto"], "always"); set_internal_field(obj, KEY_RTF_STYLE, string_value(&style)); set_internal_field(obj, KEY_NUMERIC, string_value(&numeric)); install_bound_instance_function(obj, "format", rtf_bound_format_thunk as *const u8, 2); @@ -1154,21 +1410,49 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option ); } KIND_PLURAL_RULES => { - let pr_type = enum_option(options, "type", &["cardinal", "ordinal"], "cardinal"); + // `? GetOptionsObject(options)`, then GetOption in the exact order + // constructor-option-read-order.js asserts: localeMatcher, type, + // notation, compactDisplay, then SetNumberFormatDigitOptions + // (minimumIntegerDigits, min/maxFractionDigits, min/maxSignificantDigits, + // roundingIncrement, roundingMode, roundingPriority, trailingZeroDisplay). 
+ let options = get_options_object(options); + let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); + let pr_type = enum_option_strict(options, "type", &["cardinal", "ordinal"], "cardinal"); set_internal_field(obj, KEY_TYPE, string_value(&pr_type)); + let notation = enum_option_strict( + options, + "notation", + &["standard", "scientific", "engineering", "compact"], + "standard", + ); + let compact_display = + enum_option_strict(options, "compactDisplay", &["short", "long"], "short"); + set_internal_field(obj, KEY_PR_NOTATION, string_value(&notation)); + if notation == "compact" { + set_internal_field(obj, KEY_PR_COMPACT_DISPLAY, string_value(&compact_display)); + } let min_int = get_option_number(options, "minimumIntegerDigits").unwrap_or(1.0); set_internal_field(obj, KEY_PR_MIN_INT, min_int); + let min_frac_read = get_option_number(options, "minimumFractionDigits"); + let max_frac_read = get_option_number(options, "maximumFractionDigits"); let min_sig = get_option_number(options, "minimumSignificantDigits"); let max_sig = get_option_number(options, "maximumSignificantDigits"); + // Trailing SetNumberFormatDigitOptions reads — observed for read-order + // parity even though Perry's plural selection ignores their values.
+ let _ = get_option_value(options, "roundingIncrement"); + let _ = get_option_value(options, "roundingMode"); + let _ = get_option_value(options, "roundingPriority"); + let _ = get_option_value(options, "trailingZeroDisplay"); if min_sig.is_some() || max_sig.is_some() { set_internal_field(obj, KEY_PR_USE_SIG, bool_value(true)); set_internal_field(obj, KEY_PR_MIN_SIG, min_sig.unwrap_or(1.0)); set_internal_field(obj, KEY_PR_MAX_SIG, max_sig.unwrap_or(21.0)); } else { set_internal_field(obj, KEY_PR_USE_SIG, bool_value(false)); - let min_frac = get_option_number(options, "minimumFractionDigits").unwrap_or(0.0); - let max_frac = get_option_number(options, "maximumFractionDigits") - .unwrap_or_else(|| min_frac.max(3.0)); + // Reuse the values read above (in spec order) — re-reading would + // double-invoke the option getters and break read-order parity. + let min_frac = min_frac_read.unwrap_or(0.0); + let max_frac = max_frac_read.unwrap_or_else(|| min_frac.max(3.0)); set_internal_field(obj, KEY_PR_MIN_FRAC, min_frac); set_internal_field(obj, KEY_PR_MAX_FRAC, max_frac); } @@ -1288,17 +1572,29 @@ extern "C" fn plural_rules_constructor_thunk(closure: *const ClosureHeader, rest ) } -fn supported_locales_array(locales: f64) -> f64 { - let locales = locales_from_value(locales); - let mut arr = js_array_alloc(locales.len() as u32); - for locale in locales { - arr = js_array_push_f64(arr, string_value(&locale)); +fn supported_locales_array(locales: f64, options: f64) -> f64 { + // SupportedLocales: when `options` is not undefined, `? ToObject(options)` + // (null → TypeError) then `? GetOption(options, "localeMatcher", …)` — so an + // invalid localeMatcher is a RangeError even though the matcher choice does + // not affect Perry's lookup result. 
+ if !JSValue::from_bits(options.to_bits()).is_undefined() { + let options = coerce_options_reject_null(options); + let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); + } + // BestAvailableLocale-filter the canonicalized request list: drop tags whose + // primary language Perry can't service (e.g. `zxx`), keeping order + dedup. + let requested = locales_from_value(locales); + let mut arr = js_array_alloc(0); + for locale in requested { + if is_available_locale(&locale) { + arr = js_array_push_f64(arr, string_value(&locale)); + } } js_nanbox_pointer(arr as i64) } -extern "C" fn supported_locales_of_thunk(_closure: *const ClosureHeader, locales: f64) -> f64 { - supported_locales_array(locales) +extern "C" fn supported_locales_of_thunk(_closure: *const ClosureHeader, rest: f64) -> f64 { + supported_locales_array(rest_arg(rest, 0), rest_arg(rest, 1)) } fn install_function( @@ -1432,13 +1728,16 @@ fn install_constructor( PropertyAttrs::new(false, false, false), ); + // `supportedLocalesOf(locales, options)` — `.length` is 1, but it reads a + // second `options` argument, so register it rest-style (all args collected) + // and pull both positionally. let supported = install_function( ctor as *mut ObjectHeader, "supportedLocalesOf", supported_locales_of_thunk as *const u8, + 0, 1, - 1, - false, + true, ); crate::closure::closure_set_dynamic_prop(ctor as usize, "supportedLocalesOf", supported); diff --git a/crates/perry-runtime/src/intl/date_collator.rs b/crates/perry-runtime/src/intl/date_collator.rs index e658321f09..8736e93129 100644 --- a/crates/perry-runtime/src/intl/date_collator.rs +++ b/crates/perry-runtime/src/intl/date_collator.rs @@ -342,7 +342,25 @@ pub(crate) fn swedish_collation_key(s: &str) -> Vec { .collect() } +/// Normalize to NFD so canonically-equivalent strings (e.g. `"ö"` precomposed +/// vs. 
`"ö"` decomposed) collate equal — the ECMA-402 requirement that +/// `Collator.compare` treats canonical equivalents as 0 (canonically-equivalent +/// -strings.js). Without `string-normalize` this is an identity passthrough, so +/// the precomposed/decomposed pair still compares unequal (best effort). +#[cfg(feature = "string-normalize")] +fn collation_normalize(s: &str) -> String { + use unicode_normalization::UnicodeNormalization; + s.nfd().collect() +} +#[cfg(not(feature = "string-normalize"))] +fn collation_normalize(s: &str) -> String { + s.to_string() +} + pub(crate) fn compare_strings(locale: &str, left: &str, right: &str) -> f64 { + let left = collation_normalize(left); + let right = collation_normalize(right); + let (left, right) = (left.as_str(), right.as_str()); let ordering = if locale == "sv" || locale.starts_with("sv-") { swedish_collation_key(left).cmp(&swedish_collation_key(right)) } else { @@ -373,9 +391,36 @@ pub(crate) extern "C" fn collator_bound_compare_thunk( collator_compare_object(obj, left, right) } +/// Strip the code points a UCA `ignorePunctuation` collator treats as ignorable +/// — whitespace and punctuation — so e.g. `compare("", " ")` and +/// `compare("", "*")` are 0 (compare/ignorePunctuation.js). +fn strip_ignorable_punctuation(s: &str) -> String { + s.chars() + .filter(|c| !c.is_whitespace() && !is_punctuation(*c)) + .collect() +} + +fn is_punctuation(c: char) -> bool { + // ASCII punctuation plus the common Unicode punctuation/symbol ranges that + // UCA marks variable; a pragmatic superset of what the parity tests exercise. 
+ c.is_ascii_punctuation() + || matches!(c, + '\u{00A1}'..='\u{00BF}' + | '\u{2010}'..='\u{2027}' + | '\u{2030}'..='\u{205E}' + | '\u{3000}'..='\u{303F}') +} + pub(crate) fn collator_compare_object(obj: *const ObjectHeader, left: f64, right: f64) -> f64 { let locale = get_string_field(obj, KEY_LOCALE).unwrap_or_else(|| "en-US".to_string()); - compare_strings(&locale, &value_to_string(left), &value_to_string(right)) + let ignore_punct = + get_field(obj, KEY_COL_IGNORE_PUNCT).to_bits() == crate::value::TAG_TRUE; + let (mut l, mut r) = (value_to_string(left), value_to_string(right)); + if ignore_punct { + l = strip_ignorable_punctuation(&l); + r = strip_ignorable_punctuation(&r); + } + compare_strings(&locale, &l, &r) } pub(crate) extern "C" fn collator_resolved_options_thunk(_closure: *const ClosureHeader) -> f64 { @@ -391,16 +436,49 @@ pub(crate) extern "C" fn collator_bound_resolved_options_thunk( } pub(crate) fn collator_resolved_options_object(obj: *const ObjectHeader) -> f64 { - let out = js_object_alloc(0, 6); + let out = js_object_alloc(0, 7); + // Property insertion order matches ECMA-402 (resolvedOptions/order.js): + // locale, usage, sensitivity, ignorePunctuation, collation, numeric, caseFirst. 
set_field( out, "locale", string_value(&get_string_field(obj, KEY_LOCALE).unwrap_or_else(|| "en-US".to_string())), ); - set_field(out, "usage", string_value("sort")); - set_field(out, "sensitivity", string_value("variant")); - set_field(out, "ignorePunctuation", bool_value(false)); - set_field(out, "numeric", bool_value(false)); - set_field(out, "caseFirst", string_value("false")); + set_field( + out, + "usage", + string_value(&get_string_field(obj, KEY_COL_USAGE).unwrap_or_else(|| "sort".to_string())), + ); + set_field( + out, + "sensitivity", + string_value( + &get_string_field(obj, KEY_COL_SENSITIVITY).unwrap_or_else(|| "variant".to_string()), + ), + ); + set_field( + out, + "ignorePunctuation", + bool_value(get_field(obj, KEY_COL_IGNORE_PUNCT).to_bits() == crate::value::TAG_TRUE), + ); + set_field( + out, + "collation", + string_value( + &get_string_field(obj, KEY_COL_COLLATION).unwrap_or_else(|| "default".to_string()), + ), + ); + set_field( + out, + "numeric", + bool_value(get_field(obj, KEY_COL_NUMERIC).to_bits() == crate::value::TAG_TRUE), + ); + set_field( + out, + "caseFirst", + string_value( + &get_string_field(obj, KEY_COL_CASE_FIRST).unwrap_or_else(|| "false".to_string()), + ), + ); js_nanbox_pointer(out as i64) } diff --git a/crates/perry-runtime/src/intl/list_relative_plural.rs b/crates/perry-runtime/src/intl/list_relative_plural.rs index 1878d41e94..cbec0c76de 100644 --- a/crates/perry-runtime/src/intl/list_relative_plural.rs +++ b/crates/perry-runtime/src/intl/list_relative_plural.rs @@ -269,8 +269,9 @@ const RTF_SINGULAR_UNITS: &[&str] = &[ /// Normalize a RelativeTimeFormat unit argument (singular or plural) to its /// singular sanctioned form, or `None` if unrecognized (caller raises RangeError). 
pub(crate) fn rtf_singular_unit(unit: &str) -> Option<&'static str> { - let lower = unit.to_ascii_lowercase(); - let candidate = lower.strip_suffix('s').unwrap_or(&lower); + // The sanctioned units are case-sensitive (ECMA-402 IsSanctionedSingularUnit): + // `"second"`/`"seconds"` are accepted, `"SECOND"` is not (format/unit-invalid.js). + let candidate = unit.strip_suffix('s').unwrap_or(unit); RTF_SINGULAR_UNITS.iter().copied().find(|u| *u == candidate) } @@ -300,18 +301,50 @@ pub(crate) fn rtf_parts(value: f64, unit: &str) -> Vec<(&'static str, String)> { parts } -pub(crate) fn rtf_instance_parts(value: f64, unit_arg: f64) -> Vec<(&'static str, String)> { - let number = JSValue::from_bits(value.to_bits()).to_number(); +/// Shared steps of `format`/`formatToParts`: `value = ? ToNumber(value)` (a +/// Symbol throws TypeError; an object's `valueOf` is honoured), then +/// `unit = ? ToString(unit)`, then the RangeError guards for a non-finite value +/// or an unsanctioned unit. Returns the rendered parts together with the +/// resolved singular `unit` (the `[[Unit]]` field formatToParts attaches). +pub(crate) fn rtf_instance_parts_and_unit( + value: f64, + unit_arg: f64, +) -> (Vec<(&'static str, String)>, &'static str) { + // Full ToNumber (not `JSValue::to_number`, which returns NaN for objects and + // doesn't reject Symbols): valueOf is invoked, and a Symbol value throws the + // expected TypeError *before* the finite-ness RangeError (format/value-symbol.js). 
+ let number = crate::builtins::js_number_coerce(value); + let unit_str = value_to_string(unit_arg); if !number.is_finite() { throw_range_error("Value need to be finite number for Intl.RelativeTimeFormat.format()"); } - let unit_str = value_to_string(unit_arg); let Some(unit) = rtf_singular_unit(&unit_str) else { throw_range_error(&format!( "Value {unit_str} out of range for Intl.RelativeTimeFormat.format() unit" )); }; - rtf_parts(number, unit) + (rtf_parts(number, unit), unit) +} + +pub(crate) fn rtf_instance_parts(value: f64, unit_arg: f64) -> Vec<(&'static str, String)> { + rtf_instance_parts_and_unit(value, unit_arg).0 +} + +/// Build the `formatToParts` array, attaching the `[[Unit]]` field to every part +/// derived from the formatted number (i.e. every non-`"literal"` part) per +/// FormatRelativeTimeToParts (formatToParts/result-type.js). +fn rtf_parts_to_js_array(parts: &[(&'static str, String)], unit: &str) -> f64 { + let mut arr = js_array_alloc(parts.len() as u32); + for (ty, val) in parts { + let obj = js_object_alloc(0, 3); + set_field(obj, "type", string_value(ty)); + set_field(obj, "value", string_value(val)); + if *ty != "literal" { + set_field(obj, "unit", string_value(unit)); + } + arr = js_array_push_f64(arr, js_nanbox_pointer(obj as i64)); + } + js_nanbox_pointer(arr as i64) } pub(crate) extern "C" fn rtf_format_thunk( @@ -348,7 +381,8 @@ pub(crate) extern "C" fn rtf_to_parts_thunk( unit: f64, ) -> f64 { let _obj = this_intl_object("formatToParts", KIND_RELATIVE_TIME); - parts_to_js_array(&rtf_instance_parts(value, unit)) + let (parts, unit) = rtf_instance_parts_and_unit(value, unit); + rtf_parts_to_js_array(&parts, unit) } pub(crate) extern "C" fn rtf_bound_to_parts_thunk( @@ -357,7 +391,8 @@ pub(crate) extern "C" fn rtf_bound_to_parts_thunk( unit: f64, ) -> f64 { let _obj = captured_intl_object(closure, "formatToParts", KIND_RELATIVE_TIME); - parts_to_js_array(&rtf_instance_parts(value, unit)) + let (parts, unit) = 
rtf_instance_parts_and_unit(value, unit); + rtf_parts_to_js_array(&parts, unit) } pub(crate) fn rtf_resolved_options_object(obj: *const ObjectHeader) -> f64 { @@ -469,8 +504,17 @@ pub(crate) extern "C" fn plural_rules_bound_select_range_thunk( } pub(crate) fn plural_select_range(start: f64, end: f64) -> f64 { - let s = JSValue::from_bits(start.to_bits()).to_number(); - let e = JSValue::from_bits(end.to_bits()).to_number(); + // PluralRules.prototype.selectRange(start, end): a `undefined` endpoint is a + // TypeError (step 3), evaluated *before* the `? ToNumber` coercions — and + // ToNumber itself throws TypeError for a Symbol (selectRange/ + // undefined-arguments-throws.js, argument-tonumber-throws.js). + if JSValue::from_bits(start.to_bits()).is_undefined() + || JSValue::from_bits(end.to_bits()).is_undefined() + { + throw_type_error("Intl.PluralRules.prototype.selectRange: start and end must be defined"); + } + let s = crate::builtins::js_number_coerce(start); + let e = crate::builtins::js_number_coerce(end); if s.is_nan() || e.is_nan() { throw_range_error("Invalid values for Intl.PluralRules.selectRange()"); } @@ -491,7 +535,19 @@ pub(crate) fn plural_rules_resolved_options_object(obj: *const ObjectHeader) -> "type", string_value(if is_ordinal { "ordinal" } else { "cardinal" }), ); - set_field(out, "notation", string_value("standard")); + let notation = + get_string_field(obj, KEY_PR_NOTATION).unwrap_or_else(|| "standard".to_string()); + set_field(out, "notation", string_value(&notation)); + // `compactDisplay` surfaces only when notation is "compact".
+ if notation == "compact" { + set_field( + out, + "compactDisplay", + string_value( + &get_string_field(obj, KEY_PR_COMPACT_DISPLAY).unwrap_or_else(|| "short".to_string()), + ), + ); + } set_field( out, "minimumIntegerDigits", From b02914cb4cd1bb473d66199df63caf929fc83fb6 Mon Sep 17 00:00:00 2001 From: Ralph Date: Sat, 27 Jun 2026 01:50:15 -0700 Subject: [PATCH 2/2] =?UTF-8?q?fix(intl):=20#5585=20=E2=80=94=20address=20?= =?UTF-8?q?CodeRabbit=20review=20(fmt=20+=20correctness)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - cargo fmt (the lint check failed on formatting). - Route the CanonicalizeLocaleList branches through `canonicalize_language_tag` (feature-aware, matches intl/locales.rs) instead of `canonical_locale`. - Skip absent indices via HasProperty when walking array-like locale objects (`{length:3,0:"en"}` → `["en"]`, not a TypeError on the holes). - `unicode_extension_keyword` stops at the `x` private-use singleton so `en-x-u-kn` is not misread as a `kn` Unicode extension. - Collator `collation` option rejects the reserved `standard`/`search` values (and malformed types); a valid option now wins over the `-u-co-` keyword. - `supportedLocalesOf` runs CanonicalizeLocaleList(locales) before reading `options` (locale errors/side-effects precede option validation). - Collator compare normalizes with NFC (not NFD) so the Swedish fast path keeps its precomposed å/ä/ö; `ignorePunctuation` stripping uses an explicit punctuation set instead of whole Latin-1 ranges (no longer eats ª/µ/º/¹²³/¼). - RTF `format`/`formatToParts` and PluralRules `selectRange` use a ToNumber that rejects BigInt with TypeError (`format(1n,"day")` throws). test262 intl402 across the six dirs: 307/387 pass (79%), up from 297 — 10 more cases, no regressions. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/perry-runtime/src/intl.rs | 131 +++++++++++++----- .../perry-runtime/src/intl/date_collator.rs | 21 +-- .../src/intl/list_relative_plural.rs | 31 +++-- 3 files changed, 127 insertions(+), 56 deletions(-) diff --git a/crates/perry-runtime/src/intl.rs b/crates/perry-runtime/src/intl.rs index 325a117b6d..abf76367b8 100644 --- a/crates/perry-runtime/src/intl.rs +++ b/crates/perry-runtime/src/intl.rs @@ -549,6 +549,14 @@ fn canonicalize_language_tag(tag: &str) -> Option { } } +/// `HasProperty(O, ToString(index))` — true when the integer-indexed property is +/// present (own or inherited). Used to skip holes/absent indices in +/// CanonicalizeLocaleList's array/array-like walk. +fn js_has_index(obj: f64, index: u32) -> bool { + let key = string_value(&index.to_string()); + crate::object::js_object_has_property(obj, key).to_bits() == crate::value::TAG_TRUE +} + /// CanonicalizeLocaleList element handler: a present element must be a String or /// an Object (an `Intl.Locale` or anything ToString-able), else `TypeError`; the /// resulting tag is canonicalized (`RangeError` if structurally invalid) and @@ -563,7 +571,7 @@ fn push_locale_element(out: &mut Vec, value: f64) { // undefined / null / boolean / number / Symbol element → TypeError. throw_type_error("locale must be a String or Object"); }; - let Some(canonical) = canonical_locale(&tag) else { + let Some(canonical) = canonicalize_language_tag(&tag) else { throw_invalid_language_tag(&tag); }; if !out.iter().any(|existing| existing == &canonical) { @@ -584,7 +592,7 @@ fn locales_from_value(locales: f64) -> Vec { // A String argument is treated as a single-element list (not iterated by char). 
if js.is_any_string() { let tag = string_from_string_value(locales).unwrap_or_default(); - let Some(canonical) = canonical_locale(&tag) else { + let Some(canonical) = canonicalize_language_tag(&tag) else { throw_invalid_language_tag(&tag); }; return vec![canonical]; @@ -611,6 +619,11 @@ fn locales_from_value(locales: f64) -> Vec { }; let mut out = Vec::with_capacity(len as usize); for i in 0..len { + // Skip absent indices (`HasProperty` is false) — e.g. + // `{ length: 3, 0: "en" }` yields just `["en"]`, never `undefined`. + if !js_has_index(locales, i) { + continue; + } push_locale_element(&mut out, get_field(obj, &i.to_string())); } return out; @@ -654,9 +667,14 @@ fn unicode_extension_keyword(locale: &str, key: &str) -> Option { let lower = locale.to_ascii_lowercase(); let key = key.to_ascii_lowercase(); let mut iter = lower.split('-'); - // Advance to the `u` singleton. + // Advance to the `u` singleton. A `x` singleton starts the private-use + // sequence (which must come last); a `u` inside it — e.g. `en-x-u-kn` — is + // private data, not a Unicode extension, so stop scanning there. let mut in_u = false; - while let Some(p) = iter.next() { + for p in iter.by_ref() { + if p == "x" { + return None; + } if p == "u" { in_u = true; break; @@ -1260,27 +1278,34 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option // (constructor-options-throwing-getters / resolvedOptions order.js). let options = coerce_options_reject_null(options); let usage = enum_option_strict(options, "usage", &["sort", "search"], "sort"); - let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); - // `collation` is a free-form `type` string (RangeError if malformed); - // it has no CLDR effect here, so it resolves to "default". 
- if let Some(collation) = get_option_string_coerced(options, "collation") { - if !is_well_formed_numbering_system(&collation) { + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); + // `collation` is a `type` string: malformed, or the reserved `standard` + // /`search` values, are a RangeError (the latter are only valid as a + // `usage` selector, never an explicit collation). A valid value wins + // over any `-u-co-` keyword; absent ⇒ fall back to the extension. + let collation_opt = get_option_string_coerced(options, "collation").map(|v| { + if !is_well_formed_numbering_system(&v) || v == "standard" || v == "search" { throw_range_error(&format!( - "Value {collation} out of range for Intl options property collation" + "Value {v} out of range for Intl options property collation" )); } - } + v + }); let numeric_opt = get_bool_option(options, "numeric"); - let case_first_opt = - get_option_string_coerced(options, "caseFirst").map(|v| { - if ["upper", "lower", "false"].contains(&v.as_str()) { - v - } else { - throw_range_error(&format!( - "Value {v} out of range for Intl options property caseFirst" - )) - } - }); + let case_first_opt = get_option_string_coerced(options, "caseFirst").map(|v| { + if ["upper", "lower", "false"].contains(&v.as_str()) { + v + } else { + throw_range_error(&format!( + "Value {v} out of range for Intl options property caseFirst" + )) + } + }); let sensitivity = enum_option_strict( options, "sensitivity", @@ -1291,20 +1316,21 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option // ResolveLocale: when an option is absent, fall back to the matching // Unicode (`-u-`) extension keyword in the resolved locale — `kn` // (numeric, value-less ⇒ true) and `kf` (caseFirst). 
- let numeric = numeric_opt.unwrap_or_else(|| { - match unicode_extension_keyword(&locale, "kn") { + let numeric = + numeric_opt.unwrap_or_else(|| match unicode_extension_keyword(&locale, "kn") { Some(v) => v != "false", None => false, - } - }); + }); let case_first = case_first_opt.unwrap_or_else(|| { unicode_extension_keyword(&locale, "kf") .filter(|v| ["upper", "lower", "false"].contains(&v.as_str())) .unwrap_or_else(|| "false".to_string()) }); - let collation = unicode_extension_keyword(&locale, "co") - .filter(|v| !v.is_empty() && v != "standard" && v != "search") - .unwrap_or_else(|| "default".to_string()); + let collation = collation_opt.unwrap_or_else(|| { + unicode_extension_keyword(&locale, "co") + .filter(|v| !v.is_empty() && v != "standard" && v != "search") + .unwrap_or_else(|| "default".to_string()) + }); set_internal_field(obj, KEY_COL_USAGE, string_value(&usage)); set_internal_field(obj, KEY_COL_SENSITIVITY, string_value(&sensitivity)); set_internal_field(obj, KEY_COL_IGNORE_PUNCT, bool_value(ignore_punct)); @@ -1328,7 +1354,12 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option // `? ToObject(options)` (null → TypeError), then GetOption in order: // localeMatcher, granularity (options-order.js / options-null.js). let options = coerce_options_reject_null(options); - let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); let granularity = normalize_granularity(get_option_string_coerced(options, "granularity")); set_internal_field(obj, KEY_GRANULARITY, string_value(&granularity)); @@ -1350,7 +1381,12 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option // TypeError), then GetOption: localeMatcher, type, style // (options-getoptionsobject.js / options-order.js). 
let options = get_options_object(options); - let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); let list_type = enum_option_strict( options, "type", @@ -1383,7 +1419,12 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option // `? ToObject(options)` (null → TypeError), then GetOption in order: // localeMatcher, numberingSystem, style, numeric (options-order.js). let options = coerce_options_reject_null(options); - let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); if let Some(ns) = get_option_string_coerced(options, "numberingSystem") { if !is_well_formed_numbering_system(&ns) { throw_range_error(&format!( @@ -1416,7 +1457,12 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option // (minimumIntegerDigits, min/maxFractionDigits, min/maxSignificantDigits, // roundingIncrement, roundingMode, roundingPriority, trailingZeroDisplay). let options = get_options_object(options); - let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); let pr_type = enum_option_strict(options, "type", &["cardinal", "ordinal"], "cardinal"); set_internal_field(obj, KEY_TYPE, string_value(&pr_type)); let notation = enum_option_strict( @@ -1573,17 +1619,26 @@ extern "C" fn plural_rules_constructor_thunk(closure: *const ClosureHeader, rest } fn supported_locales_array(locales: f64, options: f64) -> f64 { - // SupportedLocales: when `options` is not undefined, `? ToObject(options)` - // (null → TypeError) then `? 
GetOption(options, "localeMatcher", …)` — so an - // invalid localeMatcher is a RangeError even though the matcher choice does - // not affect Perry's lookup result. + // `supportedLocalesOf(locales, options)`: + // 1. requestedLocales = ? CanonicalizeLocaleList(locales) ← runs FIRST, + // so a malformed locale errors before `options` is touched. + // 2. SupportedLocales(..., options): when `options` is not undefined, + // `? ToObject(options)` (null → TypeError) then + // `? GetOption(options, "localeMatcher", …)` — an invalid localeMatcher + // is a RangeError even though the matcher choice does not affect Perry's + // lookup result. + let requested = locales_from_value(locales); if !JSValue::from_bits(options.to_bits()).is_undefined() { let options = coerce_options_reject_null(options); - let _ = enum_option_strict(options, "localeMatcher", &["lookup", "best fit"], "best fit"); + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); } // BestAvailableLocale-filter the canonicalized request list: drop tags whose // primary language Perry can't service (e.g. `zxx`), keeping order + dedup. - let requested = locales_from_value(locales); let mut arr = js_array_alloc(0); for locale in requested { if is_available_locale(&locale) { diff --git a/crates/perry-runtime/src/intl/date_collator.rs b/crates/perry-runtime/src/intl/date_collator.rs index 8736e93129..ec3d156f28 100644 --- a/crates/perry-runtime/src/intl/date_collator.rs +++ b/crates/perry-runtime/src/intl/date_collator.rs @@ -350,7 +350,9 @@ pub(crate) fn swedish_collation_key(s: &str) -> Vec { #[cfg(feature = "string-normalize")] fn collation_normalize(s: &str) -> String { use unicode_normalization::UnicodeNormalization; - s.nfd().collect() + // NFC (composition), not NFD: it makes canonical equivalents equal while + // keeping precomposed `å/ä/ö` intact for the Swedish fast path below. 
+ s.nfc().collect() } #[cfg(not(feature = "string-normalize"))] fn collation_normalize(s: &str) -> String { @@ -401,20 +403,23 @@ fn strip_ignorable_punctuation(s: &str) -> String { } fn is_punctuation(c: char) -> bool { - // ASCII punctuation plus the common Unicode punctuation/symbol ranges that - // UCA marks variable; a pragmatic superset of what the parity tests exercise. + // ASCII punctuation plus an explicit set of Unicode punctuation code points, + // deliberately NOT whole Latin-1 ranges — those contain letters/numbers + // (`ª` U+00AA, `µ` U+00B5, `º` U+00BA, the `¹²³` superscripts, `¼½¾` + // fractions) that must not be stripped or distinct strings would compare + // equal. The General Punctuation block (U+2000–U+206F) and CJK punctuation + // (U+3000–U+303F) are all punctuation/spaces and are safe as ranges. c.is_ascii_punctuation() || matches!(c, - '\u{00A1}'..='\u{00BF}' - | '\u{2010}'..='\u{2027}' - | '\u{2030}'..='\u{205E}' + '\u{00A1}' | '\u{00A7}' | '\u{00AB}' | '\u{00B6}' | '\u{00B7}' + | '\u{00BB}' | '\u{00BF}' + | '\u{2000}'..='\u{206F}' | '\u{3000}'..='\u{303F}') } pub(crate) fn collator_compare_object(obj: *const ObjectHeader, left: f64, right: f64) -> f64 { let locale = get_string_field(obj, KEY_LOCALE).unwrap_or_else(|| "en-US".to_string()); - let ignore_punct = - get_field(obj, KEY_COL_IGNORE_PUNCT).to_bits() == crate::value::TAG_TRUE; + let ignore_punct = get_field(obj, KEY_COL_IGNORE_PUNCT).to_bits() == crate::value::TAG_TRUE; let (mut l, mut r) = (value_to_string(left), value_to_string(right)); if ignore_punct { l = strip_ignorable_punctuation(&l); diff --git a/crates/perry-runtime/src/intl/list_relative_plural.rs b/crates/perry-runtime/src/intl/list_relative_plural.rs index cbec0c76de..ed354223b7 100644 --- a/crates/perry-runtime/src/intl/list_relative_plural.rs +++ b/crates/perry-runtime/src/intl/list_relative_plural.rs @@ -301,8 +301,20 @@ pub(crate) fn rtf_parts(value: f64, unit: &str) -> Vec<(&'static str, String)> { parts } +/// 
`ToNumber(value)` that rejects BigInt with a TypeError, matching the +/// ECMA-262 abstract operation. `js_number_coerce` alone converts `1n` → `1` +/// (for `Number(1n)`), but `Intl` `format`/`select*` go through ToNumber, so +/// `format(1n, "day")` must throw. A Symbol still throws inside `js_number_coerce`, +/// and an object's `valueOf` is honoured there. +pub(crate) fn to_number_reject_bigint(value: f64) -> f64 { + if JSValue::from_bits(value.to_bits()).is_bigint() { + throw_type_error("Cannot convert a BigInt value to a number"); + } + crate::builtins::js_number_coerce(value) +} + /// Shared steps of `format`/`formatToParts`: `value = ? ToNumber(value)` (a -/// Symbol throws TypeError; an object's `valueOf` is honoured), then +/// Symbol or BigInt throws TypeError; an object's `valueOf` is honoured), then /// `unit = ? ToString(unit)`, then the RangeError guards for a non-finite value /// or an unsanctioned unit. Returns the rendered parts together with the /// resolved singular `unit` (the `[[Unit]]` field formatToParts attaches). @@ -310,10 +322,9 @@ pub(crate) fn rtf_instance_parts_and_unit( value: f64, unit_arg: f64, ) -> (Vec<(&'static str, String)>, &'static str) { - // Full ToNumber (not `JSValue::to_number`, which returns NaN for objects and - // doesn't reject Symbols): valueOf is invoked, and a Symbol value throws the - // expected TypeError *before* the finite-ness RangeError (format/value-symbol.js). - let number = crate::builtins::js_number_coerce(value); + // ToNumber: a Symbol/BigInt value throws TypeError *before* the finite-ness + // RangeError (format/value-symbol.js); an object's valueOf is invoked. 
+ let number = to_number_reject_bigint(value); let unit_str = value_to_string(unit_arg); if !number.is_finite() { throw_range_error("Value need to be finite number for Intl.RelativeTimeFormat.format()"); @@ -513,8 +524,8 @@ pub(crate) fn plural_select_range(start: f64, end: f64) -> f64 { { throw_type_error("Intl.PluralRules.prototype.selectRange: start and end must be defined"); } - let s = crate::builtins::js_number_coerce(start); - let e = crate::builtins::js_number_coerce(end); + let s = to_number_reject_bigint(start); + let e = to_number_reject_bigint(end); if s.is_nan() || e.is_nan() { throw_range_error("Invalid values for Intl.PluralRules.selectRange()"); } @@ -535,8 +546,7 @@ pub(crate) fn plural_rules_resolved_options_object(obj: *const ObjectHeader) -> "type", string_value(if is_ordinal { "ordinal" } else { "cardinal" }), ); - let notation = - get_string_field(obj, KEY_PR_NOTATION).unwrap_or_else(|| "standard".to_string()); + let notation = get_string_field(obj, KEY_PR_NOTATION).unwrap_or_else(|| "standard".to_string()); set_field(out, "notation", string_value(&notation)); // `compactDisplay` surfaces only when notation is "compact". if notation == "compact" { @@ -544,7 +554,8 @@ pub(crate) fn plural_rules_resolved_options_object(obj: *const ObjectHeader) -> out, "compactDisplay", string_value( - &get_string_field(obj, KEY_PR_COMPACT_DISPLAY).unwrap_or_else(|| "short".to_string()), + &get_string_field(obj, KEY_PR_COMPACT_DISPLAY) + .unwrap_or_else(|| "short".to_string()), ), ); }