diff --git a/crates/perry-runtime/src/intl.rs b/crates/perry-runtime/src/intl.rs index 89e21b740..abf76367b 100644 --- a/crates/perry-runtime/src/intl.rs +++ b/crates/perry-runtime/src/intl.rs @@ -154,6 +154,14 @@ const KEY_NF_TRAILING_ZERO: &str = "__intlNfTrailingZero"; // `format` getter reads it from here so user mutation/deletion of the public // property can't corrupt what the accessor returns. const KEY_NF_BOUND_FORMAT: &str = "__intlNfBoundFormat"; +const KEY_COL_USAGE: &str = "__intlColUsage"; +const KEY_COL_SENSITIVITY: &str = "__intlColSensitivity"; +const KEY_COL_IGNORE_PUNCT: &str = "__intlColIgnorePunct"; +const KEY_COL_COLLATION: &str = "__intlColCollation"; +const KEY_COL_NUMERIC: &str = "__intlColNumeric"; +const KEY_COL_CASE_FIRST: &str = "__intlColCaseFirst"; +const KEY_PR_NOTATION: &str = "__intlPrNotation"; +const KEY_PR_COMPACT_DISPLAY: &str = "__intlPrCompactDisplay"; fn undefined() -> f64 { f64::from_bits(crate::value::TAG_UNDEFINED) @@ -541,34 +549,108 @@ fn canonicalize_language_tag(tag: &str) -> Option { } } +/// `HasProperty(O, ToString(index))` — true when the integer-indexed property is +/// present (own or inherited). Used to skip holes/absent indices in +/// CanonicalizeLocaleList's array/array-like walk. +fn js_has_index(obj: f64, index: u32) -> bool { + let key = string_value(&index.to_string()); + crate::object::js_object_has_property(obj, key).to_bits() == crate::value::TAG_TRUE +} + +/// CanonicalizeLocaleList element handler: a present element must be a String or +/// an Object (an `Intl.Locale` or anything ToString-able), else `TypeError`; the +/// resulting tag is canonicalized (`RangeError` if structurally invalid) and +/// pushed if not already present. 
+fn push_locale_element(out: &mut Vec, value: f64) { + let jv = JSValue::from_bits(value.to_bits()); + let tag = if jv.is_any_string() { + string_from_string_value(value).unwrap_or_default() + } else if object_ptr_from_value(value).is_some() { + value_to_string(value) + } else { + // undefined / null / boolean / number / Symbol element → TypeError. + throw_type_error("locale must be a String or Object"); + }; + let Some(canonical) = canonicalize_language_tag(&tag) else { + throw_invalid_language_tag(&tag); + }; + if !out.iter().any(|existing| existing == &canonical) { + out.push(canonical); + } +} + fn locales_from_value(locales: f64) -> Vec { let js = JSValue::from_bits(locales.to_bits()); - if js.is_undefined() || js.is_null() { + // CanonicalizeLocaleList(undefined) is the empty list; `null` fails ToObject + // with a TypeError (everything else is a String or coerces via ToObject). + if js.is_undefined() { return Vec::new(); } + if js.is_null() { + throw_type_error("Cannot convert undefined or null to object"); + } + // A String argument is treated as a single-element list (not iterated by char). 
+ if js.is_any_string() { + let tag = string_from_string_value(locales).unwrap_or_default(); + let Some(canonical) = canonicalize_language_tag(&tag) else { + throw_invalid_language_tag(&tag); + }; + return vec![canonical]; + } if let Some(arr) = array_ptr_from_value(locales) { let len = js_array_length(arr); let mut out = Vec::with_capacity(len as usize); for i in 0..len { - let value = js_array_get_f64(arr, i); - if let Some(tag) = string_from_string_value(value) { - let Some(canonical) = canonical_locale(&tag) else { - throw_invalid_language_tag(&tag); - }; - out.push(canonical); - } + push_locale_element(&mut out, js_array_get_f64(arr, i)); } return out; } - if let Some(tag) = string_from_string_value(locales) { - let Some(canonical) = canonical_locale(&tag) else { - throw_invalid_language_tag(&tag); + // CanonicalizeLocaleList on a generic array-like Object: iterate `O[0..length]` + // (e.g. `{ 0: "DE", length: 1 }` → `["de"]`). + if let Some(obj) = object_ptr_from_value(locales) { + // `length = ? ToLength(? Get(O, "length"))`: a throwing `length` getter or + // ToNumber step (Symbol / abrupt valueOf/toString) propagates here. + let len_raw = get_field(obj, "length"); + let len_num = crate::builtins::js_number_coerce(len_raw); + let len = if len_num.is_finite() && len_num > 0.0 { + len_num as u32 + } else { + 0 }; - return vec![canonical]; + let mut out = Vec::with_capacity(len as usize); + for i in 0..len { + // Skip absent indices (`HasProperty` is false) — e.g. + // `{ length: 3, 0: "en" }` yields just `["en"]`, never `undefined`. + if !js_has_index(locales, i) { + continue; + } + push_locale_element(&mut out, get_field(obj, &i.to_string())); + } + return out; } + // Other primitives (number/boolean/Symbol/BigInt): ToObject yields a wrapper + // with length 0 — an empty list, no throw. 
Vec::new() } +/// BestAvailableLocale (lookup) — a requested canonical locale is "supported" +/// when its primary language subtag is one Perry's deterministic formatters can +/// service. Perry carries no CLDR locale database, so this is a curated set of +/// common CLDR languages rather than a data lookup: it is enough to distinguish +/// real languages (`en`, `de`, `zh`, …) from the "no linguistic content" tag +/// `zxx` and other unsupported primaries that `supportedLocalesOf` must drop. +fn is_available_locale(canonical: &str) -> bool { + let primary = canonical.split(['-', '_']).next().unwrap_or(canonical); + const AVAILABLE_LANGUAGES: &[&str] = &[ + "af", "am", "ar", "az", "be", "bg", "bn", "bs", "ca", "cs", "cy", "da", "de", "el", "en", + "es", "et", "eu", "fa", "fi", "fil", "fr", "ga", "gl", "gu", "he", "hi", "hr", "hu", "hy", + "id", "is", "it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml", + "mn", "mr", "ms", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt", "ro", "ru", "si", "sk", + "sl", "sq", "sr", "sv", "sw", "ta", "te", "th", "tr", "uk", "ur", "uz", "vi", "zh", "zu", + ]; + AVAILABLE_LANGUAGES.contains(&primary) +} + fn locale_or_default(locales: f64) -> String { locales_from_value(locales) .into_iter() @@ -576,6 +658,52 @@ fn locale_or_default(locales: f64) -> String { .unwrap_or_else(|| "en-US".to_string()) } +/// Look up a Unicode (`-u-`) extension keyword's value in a BCP-47 tag. Returns +/// `Some(value)` if the 2-letter `key` is present (the value is the `-`-joined +/// run of type subtags after it, or `""` for a value-less boolean key like +/// `-u-kn`), else `None`. Case-insensitive. Used to resolve `kn`/`kf`/`co` for +/// Collator when the corresponding option is absent (numeric-and-caseFirst.js). +fn unicode_extension_keyword(locale: &str, key: &str) -> Option { + let lower = locale.to_ascii_lowercase(); + let key = key.to_ascii_lowercase(); + let mut iter = lower.split('-'); + // Advance to the `u` singleton. 
A `x` singleton starts the private-use + // sequence (which must come last); a `u` inside it — e.g. `en-x-u-kn` — is + // private data, not a Unicode extension, so stop scanning there. + let mut in_u = false; + for p in iter.by_ref() { + if p == "x" { + return None; + } + if p == "u" { + in_u = true; + break; + } + } + if !in_u { + return None; + } + let mut found = false; + let mut value: Vec<&str> = Vec::new(); + for p in iter { + if p.len() == 1 { + // Next singleton ends the `u` extension. + break; + } + if p.len() == 2 && p.chars().all(|c| c.is_ascii_alphanumeric()) { + if found { + break; // reached the next keyword + } + if p == key { + found = true; + } + } else if found { + value.push(p); + } + } + found.then(|| value.join("-")) +} + fn rest_arg(rest: f64, index: u32) -> f64 { let Some(arr) = array_ptr_from_value(rest) else { return undefined(); @@ -724,6 +852,74 @@ fn enum_option(options: f64, key: &str, allowed: &[&str], default: &str) -> Stri } } +/// `GetOption(options, key, "string", ...)` with full `ToString` coercion: only +/// `undefined` selects the default. `null`, numbers, booleans, etc. are coerced +/// via `ToString` (so `null` → `"null"`, never the absent path), and a Symbol +/// throws `TypeError` (ToString of a Symbol is a TypeError). This is the strict +/// spec behavior; `get_option_string` instead treats `null` as absent, which the +/// `options-*-invalid` value-validation tests reject. 
+fn get_option_string_coerced(options: f64, key: &str) -> Option { + let raw = get_option_value(options, key); + let jv = JSValue::from_bits(raw.to_bits()); + if jv.is_undefined() { + None + } else if jv.is_any_string() { + string_from_string_value(raw) + } else if unsafe { crate::symbol::js_is_symbol(raw) } != 0 { + throw_type_error(&format!( + "Cannot convert a Symbol value to a string for Intl options property {key}" + )); + } else { + Some(value_to_string(raw)) + } +} + +/// `GetOption` with an enumerated value set, using strict `ToString` coercion +/// (see [`get_option_string_coerced`]): an out-of-range value (including a +/// `ToString`-coerced `null` / number) is a `RangeError`; absent → `default`. +fn enum_option_strict(options: f64, key: &str, allowed: &[&str], default: &str) -> String { + match get_option_string_coerced(options, key) { + None => default.to_string(), + Some(value) => { + if allowed.contains(&value.as_str()) { + value + } else { + throw_range_error(&format!( + "Value {value} out of range for Intl options property {key}" + )) + } + } + } +} + +/// `GetOptionsObject(options)`: `undefined` yields an empty bag (reported as +/// `undefined`, which the option readers treat as "every key absent"); an Object +/// passes through unchanged; any other value (including `null`, primitives, and +/// BigInt) throws `TypeError`. Used by the constructors whose spec step is +/// `GetOptionsObject` (ListFormat, Segmenter, PluralRules, …). +fn get_options_object(options: f64) -> f64 { + let jv = JSValue::from_bits(options.to_bits()); + if jv.is_undefined() { + return options; + } + if object_ptr_from_value(options).is_some() { + return options; + } + throw_type_error("Cannot convert undefined or null to object"); +} + +/// `CoerceOptionsToObject(options)` partial: `undefined` stays an empty bag and +/// `null` throws `TypeError` (`ToObject(null)`). 
Primitives are *not* boxed here +/// — Perry reads option keys directly off Objects, so a primitive simply yields +/// every-key-absent — but `null` must still reject. Used by the constructors +/// whose spec step is `ToObject` (RelativeTimeFormat, Collator, …). +fn coerce_options_reject_null(options: f64) -> f64 { + if JSValue::from_bits(options.to_bits()).is_null() { + throw_type_error("Cannot convert undefined or null to object"); + } + options +} + /// GetBooleanOption(options, key): `undefined` → `None`, otherwise ToBoolean. fn get_bool_option(options: f64, key: &str) -> Option { let value = get_option_value(options, key); @@ -1076,6 +1272,71 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option ); } KIND_COLLATOR => { + // InitializeCollator reads options via `? ToObject(options)` (null → + // TypeError) then GetOption in this exact order: usage, localeMatcher, + // collation, numeric, caseFirst, sensitivity, ignorePunctuation + // (constructor-options-throwing-getters / resolvedOptions order.js). + let options = coerce_options_reject_null(options); + let usage = enum_option_strict(options, "usage", &["sort", "search"], "sort"); + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); + // `collation` is a `type` string: malformed, or the reserved `standard` + // /`search` values, are a RangeError (the latter are only valid as a + // `usage` selector, never an explicit collation). A valid value wins + // over any `-u-co-` keyword; absent ⇒ fall back to the extension. 
+ let collation_opt = get_option_string_coerced(options, "collation").map(|v| { + if !is_well_formed_numbering_system(&v) || v == "standard" || v == "search" { + throw_range_error(&format!( + "Value {v} out of range for Intl options property collation" + )); + } + v + }); + let numeric_opt = get_bool_option(options, "numeric"); + let case_first_opt = get_option_string_coerced(options, "caseFirst").map(|v| { + if ["upper", "lower", "false"].contains(&v.as_str()) { + v + } else { + throw_range_error(&format!( + "Value {v} out of range for Intl options property caseFirst" + )) + } + }); + let sensitivity = enum_option_strict( + options, + "sensitivity", + &["base", "accent", "case", "variant"], + "variant", + ); + let ignore_punct = get_bool_option(options, "ignorePunctuation").unwrap_or(false); + // ResolveLocale: when an option is absent, fall back to the matching + // Unicode (`-u-`) extension keyword in the resolved locale — `kn` + // (numeric, value-less ⇒ true) and `kf` (caseFirst). + let numeric = + numeric_opt.unwrap_or_else(|| match unicode_extension_keyword(&locale, "kn") { + Some(v) => v != "false", + None => false, + }); + let case_first = case_first_opt.unwrap_or_else(|| { + unicode_extension_keyword(&locale, "kf") + .filter(|v| ["upper", "lower", "false"].contains(&v.as_str())) + .unwrap_or_else(|| "false".to_string()) + }); + let collation = collation_opt.unwrap_or_else(|| { + unicode_extension_keyword(&locale, "co") + .filter(|v| !v.is_empty() && v != "standard" && v != "search") + .unwrap_or_else(|| "default".to_string()) + }); + set_internal_field(obj, KEY_COL_USAGE, string_value(&usage)); + set_internal_field(obj, KEY_COL_SENSITIVITY, string_value(&sensitivity)); + set_internal_field(obj, KEY_COL_IGNORE_PUNCT, bool_value(ignore_punct)); + set_internal_field(obj, KEY_COL_COLLATION, string_value(&collation)); + set_internal_field(obj, KEY_COL_NUMERIC, bool_value(numeric)); + set_internal_field(obj, KEY_COL_CASE_FIRST, string_value(&case_first)); 
install_bound_instance_function( obj, "compare", @@ -1090,7 +1351,17 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option ); } KIND_SEGMENTER => { - let granularity = normalize_granularity(get_option_string(options, "granularity")); + // `? ToObject(options)` (null → TypeError), then GetOption in order: + // localeMatcher, granularity (options-order.js / options-null.js). + let options = coerce_options_reject_null(options); + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); + let granularity = + normalize_granularity(get_option_string_coerced(options, "granularity")); set_internal_field(obj, KEY_GRANULARITY, string_value(&granularity)); install_bound_instance_function( obj, @@ -1106,13 +1377,23 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option ); } KIND_LIST_FORMAT => { - let list_type = enum_option( + // `? GetOptionsObject(options)` (any non-Object, non-undefined → + // TypeError), then GetOption: localeMatcher, type, style + // (options-getoptionsobject.js / options-order.js). 
+ let options = get_options_object(options); + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); + let list_type = enum_option_strict( options, "type", &["conjunction", "disjunction", "unit"], "conjunction", ); - let style = enum_option(options, "style", &["long", "short", "narrow"], "long"); + let style = enum_option_strict(options, "style", &["long", "short", "narrow"], "long"); set_internal_field(obj, KEY_TYPE, string_value(&list_type)); set_internal_field(obj, KEY_LF_STYLE, string_value(&style)); install_bound_instance_function( @@ -1135,8 +1416,24 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option ); } KIND_RELATIVE_TIME => { - let style = enum_option(options, "style", &["long", "short", "narrow"], "long"); - let numeric = enum_option(options, "numeric", &["always", "auto"], "always"); + // `? ToObject(options)` (null → TypeError), then GetOption in order: + // localeMatcher, numberingSystem, style, numeric (options-order.js). 
+ let options = coerce_options_reject_null(options); + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); + if let Some(ns) = get_option_string_coerced(options, "numberingSystem") { + if !is_well_formed_numbering_system(&ns) { + throw_range_error(&format!( + "Value {ns} out of range for Intl options property numberingSystem" + )); + } + } + let style = enum_option_strict(options, "style", &["long", "short", "narrow"], "long"); + let numeric = enum_option_strict(options, "numeric", &["always", "auto"], "always"); set_internal_field(obj, KEY_RTF_STYLE, string_value(&style)); set_internal_field(obj, KEY_NUMERIC, string_value(&numeric)); install_bound_instance_function(obj, "format", rtf_bound_format_thunk as *const u8, 2); @@ -1154,21 +1451,54 @@ fn make_instance(closure: *const ClosureHeader, kind: &str, locales: f64, option ); } KIND_PLURAL_RULES => { - let pr_type = enum_option(options, "type", &["cardinal", "ordinal"], "cardinal"); + // `? GetOptionsObject(options)`, then GetOption in the exact order + // constructor-option-read-order.js asserts: localeMatcher, type, + // notation, compactDisplay, then SetNumberFormatDigitOptions + // (minimumIntegerDigits, min/maxFractionDigits, min/maxSignificantDigits, + // roundingIncrement, roundingMode, roundingPriority, trailingZeroDisplay). 
+ let options = get_options_object(options); + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); + let pr_type = enum_option_strict(options, "type", &["cardinal", "ordinal"], "cardinal"); set_internal_field(obj, KEY_TYPE, string_value(&pr_type)); + let notation = enum_option_strict( + options, + "notation", + &["standard", "scientific", "engineering", "compact"], + "standard", + ); + let compact_display = + enum_option_strict(options, "compactDisplay", &["short", "long"], "short"); + set_internal_field(obj, KEY_PR_NOTATION, string_value(&notation)); + if notation == "compact" { + set_internal_field(obj, KEY_PR_COMPACT_DISPLAY, string_value(&compact_display)); + } let min_int = get_option_number(options, "minimumIntegerDigits").unwrap_or(1.0); set_internal_field(obj, KEY_PR_MIN_INT, min_int); + let min_frac_read = get_option_number(options, "minimumFractionDigits"); + let max_frac_read = get_option_number(options, "maximumFractionDigits"); let min_sig = get_option_number(options, "minimumSignificantDigits"); let max_sig = get_option_number(options, "maximumSignificantDigits"); + // Trailing SetNumberFormatDigitOptions reads — observed for read-order + // parity even though Perry's plural selection ignores their values. 
+ let _ = get_option_value(options, "roundingIncrement"); + let _ = get_option_value(options, "roundingMode"); + let _ = get_option_value(options, "roundingPriority"); + let _ = get_option_value(options, "trailingZeroDisplay"); if min_sig.is_some() || max_sig.is_some() { set_internal_field(obj, KEY_PR_USE_SIG, bool_value(true)); set_internal_field(obj, KEY_PR_MIN_SIG, min_sig.unwrap_or(1.0)); set_internal_field(obj, KEY_PR_MAX_SIG, max_sig.unwrap_or(21.0)); } else { set_internal_field(obj, KEY_PR_USE_SIG, bool_value(false)); - let min_frac = get_option_number(options, "minimumFractionDigits").unwrap_or(0.0); - let max_frac = get_option_number(options, "maximumFractionDigits") - .unwrap_or_else(|| min_frac.max(3.0)); + // Reuse the values read above (in spec order) — re-reading would + // double-invoke the option getters and break read-order parity. + let min_frac = min_frac_read.unwrap_or(0.0); + let max_frac = max_frac_read.unwrap_or_else(|| min_frac.max(3.0)); set_internal_field(obj, KEY_PR_MIN_FRAC, min_frac); set_internal_field(obj, KEY_PR_MAX_FRAC, max_frac); } @@ -1288,17 +1618,38 @@ extern "C" fn plural_rules_constructor_thunk(closure: *const ClosureHeader, rest ) } -fn supported_locales_array(locales: f64) -> f64 { - let locales = locales_from_value(locales); - let mut arr = js_array_alloc(locales.len() as u32); - for locale in locales { - arr = js_array_push_f64(arr, string_value(&locale)); +fn supported_locales_array(locales: f64, options: f64) -> f64 { + // `supportedLocalesOf(locales, options)`: + // 1. requestedLocales = ? CanonicalizeLocaleList(locales) ← runs FIRST, + // so a malformed locale errors before `options` is touched. + // 2. SupportedLocales(..., options): when `options` is not undefined, + // `? ToObject(options)` (null → TypeError) then + // `? GetOption(options, "localeMatcher", …)` — an invalid localeMatcher + // is a RangeError even though the matcher choice does not affect Perry's + // lookup result. 
+ let requested = locales_from_value(locales); + if !JSValue::from_bits(options.to_bits()).is_undefined() { + let options = coerce_options_reject_null(options); + let _ = enum_option_strict( + options, + "localeMatcher", + &["lookup", "best fit"], + "best fit", + ); + } + // BestAvailableLocale-filter the canonicalized request list: drop tags whose + // primary language Perry can't service (e.g. `zxx`), keeping order + dedup. + let mut arr = js_array_alloc(0); + for locale in requested { + if is_available_locale(&locale) { + arr = js_array_push_f64(arr, string_value(&locale)); + } } js_nanbox_pointer(arr as i64) } -extern "C" fn supported_locales_of_thunk(_closure: *const ClosureHeader, locales: f64) -> f64 { - supported_locales_array(locales) +extern "C" fn supported_locales_of_thunk(_closure: *const ClosureHeader, rest: f64) -> f64 { + supported_locales_array(rest_arg(rest, 0), rest_arg(rest, 1)) } fn install_function( @@ -1432,13 +1783,16 @@ fn install_constructor( PropertyAttrs::new(false, false, false), ); + // `supportedLocalesOf(locales, options)` — `.length` is 1, but it reads a + // second `options` argument, so register it rest-style (all args collected) + // and pull both positionally. let supported = install_function( ctor as *mut ObjectHeader, "supportedLocalesOf", supported_locales_of_thunk as *const u8, + 0, 1, - 1, - false, + true, ); crate::closure::closure_set_dynamic_prop(ctor as usize, "supportedLocalesOf", supported); diff --git a/crates/perry-runtime/src/intl/date_collator.rs b/crates/perry-runtime/src/intl/date_collator.rs index e658321f0..ec3d156f2 100644 --- a/crates/perry-runtime/src/intl/date_collator.rs +++ b/crates/perry-runtime/src/intl/date_collator.rs @@ -342,7 +342,27 @@ pub(crate) fn swedish_collation_key(s: &str) -> Vec { .collect() } +/// Normalize to NFC so canonically-equivalent strings (e.g. `"ö"` precomposed +/// vs. 
`"ö"` decomposed) collate equal — the ECMA-402 requirement that +/// `Collator.compare` treats canonical equivalents as 0 (canonically-equivalent +/// -strings.js). Without `string-normalize` this is an identity passthrough, so +/// the precomposed/decomposed pair still compares unequal (best effort). +#[cfg(feature = "string-normalize")] +fn collation_normalize(s: &str) -> String { + use unicode_normalization::UnicodeNormalization; + // NFC (composition), not NFD: it makes canonical equivalents equal while + // keeping precomposed `å/ä/ö` intact for the Swedish fast path below. + s.nfc().collect() +} +#[cfg(not(feature = "string-normalize"))] +fn collation_normalize(s: &str) -> String { + s.to_string() +} + pub(crate) fn compare_strings(locale: &str, left: &str, right: &str) -> f64 { + let left = collation_normalize(left); + let right = collation_normalize(right); + let (left, right) = (left.as_str(), right.as_str()); let ordering = if locale == "sv" || locale.starts_with("sv-") { swedish_collation_key(left).cmp(&swedish_collation_key(right)) } else { @@ -373,9 +393,39 @@ pub(crate) extern "C" fn collator_bound_compare_thunk( collator_compare_object(obj, left, right) } +/// Strip the code points a UCA `ignorePunctuation` collator treats as ignorable +/// — whitespace and punctuation — so e.g. `compare("", " ")` and +/// `compare("", "*")` are 0 (compare/ignorePunctuation.js). +fn strip_ignorable_punctuation(s: &str) -> String { + s.chars() + .filter(|c| !c.is_whitespace() && !is_punctuation(*c)) + .collect() +} + +fn is_punctuation(c: char) -> bool { + // ASCII punctuation plus an explicit set of Unicode punctuation code points, + // deliberately NOT whole Latin-1 ranges — those contain letters/numbers + // (`ª` U+00AA, `µ` U+00B5, `º` U+00BA, the `¹²³` superscripts, `¼½¾` + // fractions) that must not be stripped or distinct strings would compare + // equal. 
General Punctuation (U+2000–U+206F) and CJK Symbols and Punctuation + // (U+3000–U+303F) are matched whole. NOTE(review): U+3005 `々`/U+3007 `〇` are not punctuation. + c.is_ascii_punctuation() + || matches!(c, + '\u{00A1}' | '\u{00A7}' | '\u{00AB}' | '\u{00B6}' | '\u{00B7}' + | '\u{00BB}' | '\u{00BF}' + | '\u{2000}'..='\u{206F}' + | '\u{3000}'..='\u{303F}') +} + pub(crate) fn collator_compare_object(obj: *const ObjectHeader, left: f64, right: f64) -> f64 { let locale = get_string_field(obj, KEY_LOCALE).unwrap_or_else(|| "en-US".to_string()); - compare_strings(&locale, &value_to_string(left), &value_to_string(right)) + let ignore_punct = get_field(obj, KEY_COL_IGNORE_PUNCT).to_bits() == crate::value::TAG_TRUE; + let (mut l, mut r) = (value_to_string(left), value_to_string(right)); + if ignore_punct { + l = strip_ignorable_punctuation(&l); + r = strip_ignorable_punctuation(&r); + } + compare_strings(&locale, &l, &r) } pub(crate) extern "C" fn collator_resolved_options_thunk(_closure: *const ClosureHeader) -> f64 { @@ -391,16 +441,49 @@ pub(crate) extern "C" fn collator_bound_resolved_options_thunk( } pub(crate) fn collator_resolved_options_object(obj: *const ObjectHeader) -> f64 { - let out = js_object_alloc(0, 6); + let out = js_object_alloc(0, 7); + // Property insertion order matches ECMA-402 (resolvedOptions/order.js): + // locale, usage, sensitivity, ignorePunctuation, collation, numeric, caseFirst. 
set_field( out, "locale", string_value(&get_string_field(obj, KEY_LOCALE).unwrap_or_else(|| "en-US".to_string())), ); - set_field(out, "usage", string_value("sort")); - set_field(out, "sensitivity", string_value("variant")); - set_field(out, "ignorePunctuation", bool_value(false)); - set_field(out, "numeric", bool_value(false)); - set_field(out, "caseFirst", string_value("false")); + set_field( + out, + "usage", + string_value(&get_string_field(obj, KEY_COL_USAGE).unwrap_or_else(|| "sort".to_string())), + ); + set_field( + out, + "sensitivity", + string_value( + &get_string_field(obj, KEY_COL_SENSITIVITY).unwrap_or_else(|| "variant".to_string()), + ), + ); + set_field( + out, + "ignorePunctuation", + bool_value(get_field(obj, KEY_COL_IGNORE_PUNCT).to_bits() == crate::value::TAG_TRUE), + ); + set_field( + out, + "collation", + string_value( + &get_string_field(obj, KEY_COL_COLLATION).unwrap_or_else(|| "default".to_string()), + ), + ); + set_field( + out, + "numeric", + bool_value(get_field(obj, KEY_COL_NUMERIC).to_bits() == crate::value::TAG_TRUE), + ); + set_field( + out, + "caseFirst", + string_value( + &get_string_field(obj, KEY_COL_CASE_FIRST).unwrap_or_else(|| "false".to_string()), + ), + ); js_nanbox_pointer(out as i64) } diff --git a/crates/perry-runtime/src/intl/list_relative_plural.rs b/crates/perry-runtime/src/intl/list_relative_plural.rs index 1878d41e9..ed354223b 100644 --- a/crates/perry-runtime/src/intl/list_relative_plural.rs +++ b/crates/perry-runtime/src/intl/list_relative_plural.rs @@ -269,8 +269,9 @@ const RTF_SINGULAR_UNITS: &[&str] = &[ /// Normalize a RelativeTimeFormat unit argument (singular or plural) to its /// singular sanctioned form, or `None` if unrecognized (caller raises RangeError). 
pub(crate) fn rtf_singular_unit(unit: &str) -> Option<&'static str> { - let lower = unit.to_ascii_lowercase(); - let candidate = lower.strip_suffix('s').unwrap_or(&lower); + // The sanctioned units are case-sensitive (ECMA-402 IsSanctionedSingularUnit): + // `"second"`/`"seconds"` are accepted, `"SECOND"` is not (format/unit-invalid.js). + let candidate = unit.strip_suffix('s').unwrap_or(unit); RTF_SINGULAR_UNITS.iter().copied().find(|u| *u == candidate) } @@ -300,18 +301,61 @@ pub(crate) fn rtf_parts(value: f64, unit: &str) -> Vec<(&'static str, String)> { parts } -pub(crate) fn rtf_instance_parts(value: f64, unit_arg: f64) -> Vec<(&'static str, String)> { - let number = JSValue::from_bits(value.to_bits()).to_number(); +/// `ToNumber(value)` that rejects BigInt with a TypeError, matching the +/// ECMA-262 abstract operation. `js_number_coerce` alone converts `1n` → `1` +/// (for `Number(1n)`), but `Intl` `format`/`select*` go through ToNumber, so +/// `format(1n, "day")` must throw. A Symbol still throws inside `js_number_coerce`, +/// and an object's `valueOf` is honoured there. +pub(crate) fn to_number_reject_bigint(value: f64) -> f64 { + if JSValue::from_bits(value.to_bits()).is_bigint() { + throw_type_error("Cannot convert a BigInt value to a number"); + } + crate::builtins::js_number_coerce(value) +} + +/// Shared steps of `format`/`formatToParts`: `value = ? ToNumber(value)` (a +/// Symbol or BigInt throws TypeError; an object's `valueOf` is honoured), then +/// `unit = ? ToString(unit)`, then the RangeError guards for a non-finite value +/// or an unsanctioned unit. Returns the rendered parts together with the +/// resolved singular `unit` (the `[[Unit]]` field formatToParts attaches). 
+pub(crate) fn rtf_instance_parts_and_unit( + value: f64, + unit_arg: f64, +) -> (Vec<(&'static str, String)>, &'static str) { + // ToNumber: a Symbol/BigInt value throws TypeError *before* the finite-ness + // RangeError (format/value-symbol.js); an object's valueOf is invoked. + let number = to_number_reject_bigint(value); + let unit_str = value_to_string(unit_arg); if !number.is_finite() { throw_range_error("Value need to be finite number for Intl.RelativeTimeFormat.format()"); } - let unit_str = value_to_string(unit_arg); let Some(unit) = rtf_singular_unit(&unit_str) else { throw_range_error(&format!( "Value {unit_str} out of range for Intl.RelativeTimeFormat.format() unit" )); }; - rtf_parts(number, unit) + (rtf_parts(number, unit), unit) +} + +pub(crate) fn rtf_instance_parts(value: f64, unit_arg: f64) -> Vec<(&'static str, String)> { + rtf_instance_parts_and_unit(value, unit_arg).0 +} + +/// Build the `formatToParts` array, attaching the `[[Unit]]` field to every part +/// derived from the formatted number (i.e. every non-`"literal"` part) per +/// FormatRelativeTimeToParts (formatToParts/result-type.js). 
+fn rtf_parts_to_js_array(parts: &[(&'static str, String)], unit: &str) -> f64 { + let mut arr = js_array_alloc(parts.len() as u32); + for (ty, val) in parts { + let obj = js_object_alloc(0, 3); + set_field(obj, "type", string_value(ty)); + set_field(obj, "value", string_value(val)); + if *ty != "literal" { + set_field(obj, "unit", string_value(unit)); + } + arr = js_array_push_f64(arr, js_nanbox_pointer(obj as i64)); + } + js_nanbox_pointer(arr as i64) } pub(crate) extern "C" fn rtf_format_thunk( @@ -348,7 +392,8 @@ pub(crate) extern "C" fn rtf_to_parts_thunk( unit: f64, ) -> f64 { let _obj = this_intl_object("formatToParts", KIND_RELATIVE_TIME); - parts_to_js_array(&rtf_instance_parts(value, unit)) + let (parts, unit) = rtf_instance_parts_and_unit(value, unit); + rtf_parts_to_js_array(&parts, unit) } pub(crate) extern "C" fn rtf_bound_to_parts_thunk( @@ -357,7 +402,8 @@ pub(crate) extern "C" fn rtf_bound_to_parts_thunk( unit: f64, ) -> f64 { let _obj = captured_intl_object(closure, "formatToParts", KIND_RELATIVE_TIME); - parts_to_js_array(&rtf_instance_parts(value, unit)) + let (parts, unit) = rtf_instance_parts_and_unit(value, unit); + rtf_parts_to_js_array(&parts, unit) } pub(crate) fn rtf_resolved_options_object(obj: *const ObjectHeader) -> f64 { @@ -469,8 +515,17 @@ pub(crate) extern "C" fn plural_rules_bound_select_range_thunk( } pub(crate) fn plural_select_range(start: f64, end: f64) -> f64 { - let s = JSValue::from_bits(start.to_bits()).to_number(); - let e = JSValue::from_bits(end.to_bits()).to_number(); + // PluralRules.prototype.selectRange(start, end): a `undefined` endpoint is a + // TypeError (step 3), evaluated *before* the `? ToNumber` coercions — and + // ToNumber itself throws TypeError for a Symbol (selectRange/ + // undefined-arguments-throws.js, argument-tonumber-throws.js). 
+ if JSValue::from_bits(start.to_bits()).is_undefined() + || JSValue::from_bits(end.to_bits()).is_undefined() + { + throw_type_error("Intl.PluralRules.prototype.selectRange: start and end must be defined"); + } + let s = to_number_reject_bigint(start); + let e = to_number_reject_bigint(end); if s.is_nan() || e.is_nan() { throw_range_error("Invalid values for Intl.PluralRules.selectRange()"); } @@ -491,7 +546,19 @@ pub(crate) fn plural_rules_resolved_options_object(obj: *const ObjectHeader) -> "type", string_value(if is_ordinal { "ordinal" } else { "cardinal" }), ); - set_field(out, "notation", string_value("standard")); + let notation = get_string_field(obj, KEY_PR_NOTATION).unwrap_or_else(|| "standard".to_string()); + set_field(out, "notation", string_value(&notation)); + // `compactDisplay` surfaces only when notation is "compact". + if notation == "compact" { + set_field( + out, + "compactDisplay", + string_value( + &get_string_field(obj, KEY_PR_COMPACT_DISPLAY) + .unwrap_or_else(|| "short".to_string()), + ), + ); + } set_field( out, "minimumIntegerDigits",