sort: improve performance for very long lines (#12138)

sylvestre · sylvestre · commit dec19b3e33c3 · 2026-05-07T22:19:13.000+02:00
diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs
@@ -57,6 +57,10 @@ pub struct LineData<'a> {
     pub collation_key_buffer: Vec<u8>,
     /// End offsets into `collation_key_buffer` for each line's sort key.
     pub collation_key_ends: Vec<usize>,
+    /// Tracks whether each line's sort key was computed from a truncated prefix.
+    /// When `true`, prefix sort keys that compare equal must fall back to full
+    /// locale comparison.
+    pub collation_key_truncated: Vec<bool>,
 }
 
 impl LineData<'_> {
@@ -83,6 +87,7 @@ impl Chunk {
             contents.line_data.line_num_floats.clear();
             contents.line_data.collation_key_buffer.clear();
             contents.line_data.collation_key_ends.clear();
+            contents.line_data.collation_key_truncated.clear();
             contents.token_buffer.clear();
             let lines = unsafe {
                 // SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
@@ -108,6 +113,9 @@ impl Chunk {
                 line_num_floats: std::mem::take(&mut contents.line_data.line_num_floats),
                 collation_key_buffer: std::mem::take(&mut contents.line_data.collation_key_buffer),
                 collation_key_ends: std::mem::take(&mut contents.line_data.collation_key_ends),
+                collation_key_truncated: std::mem::take(
+                    &mut contents.line_data.collation_key_truncated,
+                ),
                 token_buffer: std::mem::take(&mut contents.token_buffer),
                 line_count_hint: contents.line_count_hint,
                 // buffer is set below after we consume `self`
@@ -135,6 +143,7 @@ pub struct RecycledChunk {
     line_num_floats: Vec<Option<f64>>,
     collation_key_buffer: Vec<u8>,
     collation_key_ends: Vec<usize>,
+    collation_key_truncated: Vec<bool>,
     token_buffer: Vec<Range<usize>>,
     line_count_hint: usize,
     buffer: Vec<u8>,
@@ -150,6 +159,7 @@ impl RecycledChunk {
             line_num_floats: Vec::new(),
             collation_key_buffer: Vec::new(),
             collation_key_ends: Vec::new(),
+            collation_key_truncated: Vec::new(),
             token_buffer: Vec::new(),
             line_count_hint: 0,
             buffer: vec![0; capacity],
@@ -197,6 +207,7 @@ pub fn read<T: Read>(
         line_num_floats,
         collation_key_buffer,
         collation_key_ends,
+        collation_key_truncated,
         mut token_buffer,
         mut line_count_hint,
         mut buffer,
@@ -237,6 +248,7 @@ pub fn read<T: Read>(
                 line_num_floats,
                 collation_key_buffer,
                 collation_key_ends,
+                collation_key_truncated,
             };
             parse_lines(
                 read,
diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs
@@ -643,10 +643,11 @@ impl<'a> Line<'a> {
     ) -> Self {
         #[cfg(feature = "i18n-collator")]
         if settings.precomputed.fast_locale_collation {
-            compute_sort_key_utf8(line, &mut line_data.collation_key_buffer);
+            let truncated = compute_sort_key_utf8(line, &mut line_data.collation_key_buffer);
             line_data
                 .collation_key_ends
                 .push(line_data.collation_key_buffer.len());
+            line_data.collation_key_truncated.push(truncated);
             return Self { line, index };
         }
 
@@ -2656,11 +2657,20 @@ fn compare_by<'a>(
         let a_key = a_line_data.collation_key(a.index);
         let b_key = b_line_data.collation_key(b.index);
         let mut cmp = a_key.cmp(b_key);
-        // If collation keys are equal, fall back to lexicographic comparison
-        // This can be the case for inputs like `01` and `0_1`, which have equal keys
+        // If collation keys are equal, we need to distinguish two cases:
         if cmp == Ordering::Equal {
-            // Reversing the order to match sort's sorting behaviour
-            cmp = b.line.cmp(a.line);
+            let a_truncated = a_line_data.collation_key_truncated[a.index];
+            let b_truncated = b_line_data.collation_key_truncated[b.index];
+            if a_truncated || b_truncated {
+                // Prefix sort keys matched but at least one line was truncated.
+                // Fall back to full locale comparison for correctness.
+                cmp = locale_cmp(a.line, b.line);
+            }
+            // If still equal (or neither was truncated), use reverse lexicographic
+            // tiebreak to match GNU sort's behaviour for inputs like `01` vs `0_1`.
+            if cmp == Ordering::Equal {
+                cmp = b.line.cmp(a.line);
+            }
         }
         return if global_settings.reverse {
             cmp.reverse()
diff --git a/src/uucore/src/lib/features/i18n/collator.rs b/src/uucore/src/lib/features/i18n/collator.rs
@@ -74,15 +74,38 @@ pub fn init_locale_collation() -> bool {
     try_init_collator(opts)
 }
 
-/// Compute the ICU collation sort key for the given input bytes and append it to `buf`.
-/// This allows pre-computing sort keys once per line, then comparing them with simple
-/// byte comparison during sorting (much faster than calling `compare_utf8` per comparison).
-pub fn compute_sort_key_utf8(input: &[u8], buf: &mut Vec<u8>) {
+/// Cap on input bytes used to compute a sort key; longer lines are keyed on a prefix only, so
+/// callers must fall back to `locale_cmp` when prefix keys tie. 8 KiB bounds key cost on
+/// multi-MB lines — see issue #12138 (unbounded path was ~40× slower than GNU `sort`).
+/// NOTE(review): unequal prefix keys that differ only by accent/case may misorder — TODO confirm.
+const SORT_KEY_PREFIX_LIMIT: usize = 8 * 1024;
+
+/// Append to `buf` the ICU collation sort key of `input`, built from at most the first
+/// `SORT_KEY_PREFIX_LIMIT` bytes. Returns `true` if the input was truncated; the caller must
+/// then fall back to `locale_cmp` on tie. Panics if the collator is unset or the ICU call fails.
+pub fn compute_sort_key_utf8(input: &[u8], buf: &mut Vec<u8>) -> bool {
     let c = COLLATOR
         .get()
         .expect("compute_sort_key_utf8 called before collator initialization");
-    c.write_sort_key_utf8_to(input, buf)
+    let truncated = input.len() > SORT_KEY_PREFIX_LIMIT; // strict `>`: `input[end]` below exists
+    let effective_input = if truncated {
+        let mut end = SORT_KEY_PREFIX_LIMIT;
+        while end > 0 && !is_utf8_char_boundary(input[end]) {
+            end -= 1; // step back until `end` sits on the first byte of a character
+        }
+        &input[..end] // may be shorter than the limit (empty if no boundary byte was found)
+    } else {
+        input
+    };
+    c.write_sort_key_utf8_to(effective_input, buf)
         .expect("ICU write_sort_key_utf8_to failed");
+    truncated
+}
+
+#[inline]
+fn is_utf8_char_boundary(b: u8) -> bool {
+    // ASCII (0xxxxxxx) or lead byte (11xxxxxx); as `i8`, only 10xxxxxx bytes fall below -0x40.
+    (b as i8) >= -0x40
 }
 
 /// Compare both strings with regard to the current locale.
diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs
@@ -2981,4 +2981,52 @@ fn test_consistent_sorting_with_i18n_collate() {
         .stdout_is(expected_output);
 }
 
+#[test]
+#[cfg(unix)]
+fn test_locale_utf8_long_lines_differ_after_prefix_limit() {
+    // Regression test for #12138: lines that differ only past the 8 KiB sort-key
+    // limit get equal prefix keys and must be ordered by full locale comparison.
+    let locale = "en_US.UTF-8";
+    if !is_locale_available(locale) {
+        return; // skip when the locale is reported unavailable
+    }
+    let prefix = "x".repeat(16 * 1024); // shared prefix, twice the 8 KiB key limit
+    let line_a = format!("{prefix}a\n");
+    let line_b = format!("{prefix}b\n");
+    let input = format!("{line_b}{line_a}"); // fed in reverse of the expected order
+    let expected = format!("{line_a}{line_b}");
+    new_ucmd!()
+        .env("LC_ALL", locale)
+        .pipe_in(input)
+        .succeeds()
+        .stdout_is(expected);
+}
+
+#[test]
+#[cfg(unix)]
+fn test_locale_utf8_truncation_at_multibyte_boundary() {
+    // Construct lines whose byte length is just over the 8 KiB sort-key
+    // prefix limit, with a multi-byte UTF-8 character (é = 0xC3 0xA9)
+    // straddling that boundary. The truncation logic must back off to a
+    // valid char boundary and not split the multi-byte sequence; the
+    // fallback path must then order the lines correctly.
+    let locale = "en_US.UTF-8";
+    if !is_locale_available(locale) {
+        return; // skip when the locale is reported unavailable
+    }
+    // Pad with 8191 bytes, then place "éa" / "éb": 'é' occupies byte offsets
+    // 8191-8192, so offset 8192 is its continuation byte (0xA9), and the
+    // differing ASCII byte ('a' vs 'b') sits past the limit at offset 8193.
+    let pad = "x".repeat(8 * 1024 - 1);
+    let line_a = format!("{pad}éa\n");
+    let line_b = format!("{pad}éb\n");
+    let input = format!("{line_b}{line_a}"); // fed in reverse of the expected order
+    let expected = format!("{line_a}{line_b}");
+    new_ucmd!()
+        .env("LC_ALL", locale)
+        .pipe_in(input)
+        .succeeds()
+        .stdout_is(expected);
+}
+
 /* spell-checker: enable */