diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index c1b4c07084c..0c6ed317303 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -34,7 +34,7 @@ self_cell = { workspace = true } tempfile = { workspace = true } thiserror = { workspace = true } unicode-width = { workspace = true } -uucore = { workspace = true, features = ["fs", "parser", "version-cmp"] } +uucore = { workspace = true, features = ["fs", "parser", "version-cmp", "i18n-collator"] } fluent = { workspace = true } nix = { workspace = true } diff --git a/src/uu/sort/locales/en-US.ftl b/src/uu/sort/locales/en-US.ftl index a13e932af22..13b77de67de 100644 --- a/src/uu/sort/locales/en-US.ftl +++ b/src/uu/sort/locales/en-US.ftl @@ -10,6 +10,11 @@ sort-after-help = The key format is FIELD[.CHAR][OPTIONS][,FIELD[.CHAR]][OPTIONS Valid options are: MbdfhnRrV. They override the global options for this key. + Locale-aware sorting: + The LC_ALL, LC_COLLATE, and LANG environment variables affect sorting order. + LC_ALL=C uses fast byte-wise comparison. Other locales use slower but correct Unicode collation. + For performance-critical scenarios with ASCII data, consider using LC_ALL=C. + # Error messages sort-open-failed = open failed: {$path}: {$error} sort-parse-key-error = failed to parse key {$key}: {$msg} diff --git a/src/uu/sort/locales/fr-FR.ftl b/src/uu/sort/locales/fr-FR.ftl index f434607d5d2..502a4d30c7f 100644 --- a/src/uu/sort/locales/fr-FR.ftl +++ b/src/uu/sort/locales/fr-FR.ftl @@ -10,6 +10,11 @@ sort-after-help = Le format de clé est CHAMP[.CAR][OPTIONS][,CHAMP[.CAR]][OPTIO Les options valides sont : MbdfhnRrV. Elles remplacent les options globales pour cette clé. + Tri selon la locale : + Les variables d'environnement LC_ALL, LC_COLLATE et LANG affectent l'ordre de tri. + LC_ALL=C utilise une comparaison rapide par octets. D'autres locales utilisent une collation Unicode plus lente mais correcte. 
+ Pour des scénarios critiques en performance avec des données ASCII, considérez l'utilisation de LC_ALL=C. + # Messages d'erreur sort-open-failed = échec d'ouverture : {$path} : {$error} sort-parse-key-error = échec d'analyse de la clé {$key} : {$msg} diff --git a/src/uu/sort/src/custom_str_cmp.rs b/src/uu/sort/src/custom_str_cmp.rs index aa4f73ea7bb..7f6b6b7b136 100644 --- a/src/uu/sort/src/custom_str_cmp.rs +++ b/src/uu/sort/src/custom_str_cmp.rs @@ -8,6 +8,7 @@ //! The goal is to compare strings without transforming them first (i.e. not allocating new strings) use std::cmp::Ordering; +use uucore::i18n::collator::locale_cmp; fn filter_char(c: u8, ignore_non_printing: bool, ignore_non_dictionary: bool) -> bool { if ignore_non_dictionary && !(c.is_ascii_alphanumeric() || c.is_ascii_whitespace()) { @@ -35,8 +36,8 @@ pub fn custom_str_cmp( ignore_case: bool, ) -> Ordering { if !(ignore_case || ignore_non_dictionary || ignore_non_printing) { - // There are no custom settings. Fall back to the default strcmp, which is faster. - return a.cmp(b); + // There are no custom settings. Fall back to locale-aware comparison. 
+        return locale_cmp(a, b);
     }
     let mut a_chars = a
         .iter()
diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs
index c9d1bac97aa..40b8a5835b9 100644
--- a/src/uu/sort/src/sort.rs
+++ b/src/uu/sort/src/sort.rs
@@ -47,6 +47,7 @@ use uucore::error::{FromIo, strip_errno};
 use uucore::error::{UError, UResult, USimpleError, UUsageError};
 use uucore::extendedbigdecimal::ExtendedBigDecimal;
 use uucore::format_usage;
+use uucore::i18n::collator::CollatorOptions;
 use uucore::line_ending::LineEnding;
 use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
 use uucore::parser::parse_size::{ParseSizeError, Parser};
@@ -1068,6 +1069,28 @@ fn default_merge_batch_size() -> usize {
     }
 }
 
+/// Decide whether the locale-aware collator has to be initialized for this run.
+///
+/// Returns `false` when the collating locale reported by
+/// `uucore::i18n::get_collating_locale` equals `uucore::i18n::DEFAULT_LOCALE`.
+/// Otherwise returns `true` as soon as one key selector uses `SortMode::Default`
+/// with none of `ignore_case`, `ignore_non_printing` or `dictionary_order` set,
+/// and `false` when no selector matches.
+fn will_need_locale_collation(settings: &GlobalSettings) -> bool {
+    let (collating_locale, _) = uucore::i18n::get_collating_locale();
+    // Under the default (C) locale no collator is needed, so skip the setup.
+    if *collating_locale == uucore::i18n::DEFAULT_LOCALE {
+        return false;
+    }
+
+    // Only a plain string key (default mode, no ignore/dictionary flags) counts.
+    settings.selectors.iter().any(|selector| {
+        let key = &selector.settings;
+        key.mode == SortMode::Default
+            && !(key.ignore_case || key.ignore_non_printing || key.dictionary_order)
+    })
+}
+
 #[uucore::main]
 #[allow(clippy::cognitive_complexity)]
 pub fn uumain(args: impl uucore::Args) -> UResult<()> {
@@ -1350,6 +1373,11 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
 
     settings.init_precomputed();
 
+    // Set up the locale-aware collator only when some key will compare through it
+    if will_need_locale_collation(&settings) {
+        uucore::i18n::collator::try_init_collator(CollatorOptions::default());
+    }
+
     let result = exec(&mut files, &settings, output, &mut tmp_dir);
     // Wait here if `SIGINT` was received,
     // for signal handler to do its work and terminate the program.
diff --git a/src/uucore/src/lib/features/i18n/mod.rs b/src/uucore/src/lib/features/i18n/mod.rs
index d47f2df9835..1296b2c5ab1 100644
--- a/src/uucore/src/lib/features/i18n/mod.rs
+++ b/src/uucore/src/lib/features/i18n/mod.rs
@@ -20,7 +20,7 @@ pub enum UEncoding {
     Utf8,
 }
 
-const DEFAULT_LOCALE: Locale = locale!("en-US-posix");
+pub const DEFAULT_LOCALE: Locale = locale!("en-US-posix");
 
 /// Look at 3 environment variables in the following order
 ///
@@ -64,7 +64,7 @@ fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) {
 }
 
 /// Get the collating locale from the environment
-fn get_collating_locale() -> &'static (Locale, UEncoding) {
+pub fn get_collating_locale() -> &'static (Locale, UEncoding) {
     static COLLATING_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new();
 
     COLLATING_LOCALE.get_or_init(|| get_locale_from_env("LC_COLLATE"))
diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs
index 3a4cc1a86c5..d767660d3fb 100644
--- a/tests/by-util/test_sort.rs
+++ b/tests/by-util/test_sort.rs
@@ -1906,6 +1906,26 @@ fn test_color_environment_variables() {
     }
 }
 
+#[test]
+fn test_locale_sorting_c() {
+    // With LC_ALL=C the expected output equals the input: ä, ö, ü stay after a, o, u (byte order)
+    new_ucmd!()
+        .env("LC_ALL", "C")
+        .pipe_in("a\no\nu\nä\nö\nü\n")
+        .succeeds()
+        .stdout_is("a\no\nu\nä\nö\nü\n");
+}
+
+#[test]
+fn test_locale_sorting_german() {
+    // With LC_ALL=de_DE.utf-8 each umlaut is expected right after its base vowel (German collation)
+    new_ucmd!()
+        .env("LC_ALL", "de_DE.utf-8")
+        .pipe_in("a\no\nu\nä\nö\nü\n")
+        .succeeds()
+        .stdout_is("a\nä\no\nö\nu\nü\n");
+}
+
 #[test]
 fn test_start_buffer() {
     // Test that a file with the exact same size as the start buffer is handled correctly