Skip to content

Commit dec19b3

Browse files
committed
sort: improve performance for very long lines (#12138)
1 parent 485b156 commit dec19b3

4 files changed

Lines changed: 103 additions & 10 deletions

File tree

src/uu/sort/src/chunks.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ pub struct LineData<'a> {
5757
pub collation_key_buffer: Vec<u8>,
5858
/// End offsets into `collation_key_buffer` for each line's sort key.
5959
pub collation_key_ends: Vec<usize>,
60+
/// Tracks whether each line's sort key was computed from a truncated prefix.
61+
/// When `true`, prefix sort keys that compare equal must fall back to full
62+
/// locale comparison.
63+
pub collation_key_truncated: Vec<bool>,
6064
}
6165

6266
impl LineData<'_> {
@@ -83,6 +87,7 @@ impl Chunk {
8387
contents.line_data.line_num_floats.clear();
8488
contents.line_data.collation_key_buffer.clear();
8589
contents.line_data.collation_key_ends.clear();
90+
contents.line_data.collation_key_truncated.clear();
8691
contents.token_buffer.clear();
8792
let lines = unsafe {
8893
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
@@ -108,6 +113,9 @@ impl Chunk {
108113
line_num_floats: std::mem::take(&mut contents.line_data.line_num_floats),
109114
collation_key_buffer: std::mem::take(&mut contents.line_data.collation_key_buffer),
110115
collation_key_ends: std::mem::take(&mut contents.line_data.collation_key_ends),
116+
collation_key_truncated: std::mem::take(
117+
&mut contents.line_data.collation_key_truncated,
118+
),
111119
token_buffer: std::mem::take(&mut contents.token_buffer),
112120
line_count_hint: contents.line_count_hint,
113121
// buffer is set below after we consume `self`
@@ -135,6 +143,7 @@ pub struct RecycledChunk {
135143
line_num_floats: Vec<Option<f64>>,
136144
collation_key_buffer: Vec<u8>,
137145
collation_key_ends: Vec<usize>,
146+
collation_key_truncated: Vec<bool>,
138147
token_buffer: Vec<Range<usize>>,
139148
line_count_hint: usize,
140149
buffer: Vec<u8>,
@@ -150,6 +159,7 @@ impl RecycledChunk {
150159
line_num_floats: Vec::new(),
151160
collation_key_buffer: Vec::new(),
152161
collation_key_ends: Vec::new(),
162+
collation_key_truncated: Vec::new(),
153163
token_buffer: Vec::new(),
154164
line_count_hint: 0,
155165
buffer: vec![0; capacity],
@@ -197,6 +207,7 @@ pub fn read<T: Read>(
197207
line_num_floats,
198208
collation_key_buffer,
199209
collation_key_ends,
210+
collation_key_truncated,
200211
mut token_buffer,
201212
mut line_count_hint,
202213
mut buffer,
@@ -237,6 +248,7 @@ pub fn read<T: Read>(
237248
line_num_floats,
238249
collation_key_buffer,
239250
collation_key_ends,
251+
collation_key_truncated,
240252
};
241253
parse_lines(
242254
read,

src/uu/sort/src/sort.rs

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -643,10 +643,11 @@ impl<'a> Line<'a> {
643643
) -> Self {
644644
#[cfg(feature = "i18n-collator")]
645645
if settings.precomputed.fast_locale_collation {
646-
compute_sort_key_utf8(line, &mut line_data.collation_key_buffer);
646+
let truncated = compute_sort_key_utf8(line, &mut line_data.collation_key_buffer);
647647
line_data
648648
.collation_key_ends
649649
.push(line_data.collation_key_buffer.len());
650+
line_data.collation_key_truncated.push(truncated);
650651
return Self { line, index };
651652
}
652653

@@ -2656,11 +2657,20 @@ fn compare_by<'a>(
26562657
let a_key = a_line_data.collation_key(a.index);
26572658
let b_key = b_line_data.collation_key(b.index);
26582659
let mut cmp = a_key.cmp(b_key);
2659-
// If collation keys are equal, fall back to lexicographic comparison
2660-
// This can be the case for inputs like `01` and `0_1`, which have equal keys
2660+
// If collation keys are equal, we need to distinguish two cases:
26612661
if cmp == Ordering::Equal {
2662-
// Reversing the order to match sort's sorting behaviour
2663-
cmp = b.line.cmp(a.line);
2662+
let a_truncated = a_line_data.collation_key_truncated[a.index];
2663+
let b_truncated = b_line_data.collation_key_truncated[b.index];
2664+
if a_truncated || b_truncated {
2665+
// Prefix sort keys matched but at least one line was truncated.
2666+
// Fall back to full locale comparison for correctness.
2667+
cmp = locale_cmp(a.line, b.line);
2668+
}
2669+
// If still equal (or neither was truncated), use reverse lexicographic
2670+
// tiebreak to match GNU sort's behaviour for inputs like `01` vs `0_1`.
2671+
if cmp == Ordering::Equal {
2672+
cmp = b.line.cmp(a.line);
2673+
}
26642674
}
26652675
return if global_settings.reverse {
26662676
cmp.reverse()

src/uucore/src/lib/features/i18n/collator.rs

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,38 @@ pub fn init_locale_collation() -> bool {
7474
try_init_collator(opts)
7575
}
7676

77-
/// Compute the ICU collation sort key for the given input bytes and append it to `buf`.
78-
/// This allows pre-computing sort keys once per line, then comparing them with simple
79-
/// byte comparison during sorting (much faster than calling `compare_utf8` per comparison).
80-
pub fn compute_sort_key_utf8(input: &[u8], buf: &mut Vec<u8>) {
77+
/// Cap on input bytes used to compute a sort key. Callers must fall back to
78+
/// `locale_cmp` when prefix keys tie. 8 KiB bounds key cost on multi-MB lines
79+
/// without hitting the fallback for realistic inputs — see issue #12138
80+
/// (unbounded path was ~40× slower than GNU `sort`).
///
/// The limit is expressed in bytes, not characters. `compute_sort_key_utf8`
/// starts at this byte offset and backs off to an earlier UTF-8 character
/// boundary, so the prefix that is actually keyed may be shorter than this.
81+
const SORT_KEY_PREFIX_LIMIT: usize = 8 * 1024;
82+
83+
/// Append the ICU collation sort key for `input` to `buf`, using at most
84+
/// `SORT_KEY_PREFIX_LIMIT` bytes. Returns `true` if the input was truncated;
85+
/// the caller must then fall back to `locale_cmp` on tie.
///
/// The key is appended; `buf` is not cleared, so callers that pack several
/// keys into one buffer must record the end offset themselves.
///
/// # Panics
///
/// Panics if the global `COLLATOR` has not been initialized, or if the
/// collator reports an error while writing the sort key.
///
/// NOTE(review): the "fall back only on tie" contract assumes that two prefix
/// keys comparing *unequal* always agree with a full `locale_cmp` of the
/// untruncated lines. If the sort key is laid out level by level (all primary
/// weights first, then secondary/tertiary), two prefixes that differ only at
/// a lower level (e.g. `e` vs `é`) could produce unequal keys even though the
/// untruncated tails differ at the primary level and decide the order the
/// other way. Cutting between a base character and a following combining mark
/// may have a similar effect. Confirm against the collator's key layout and
/// add a test where the truncated prefixes differ only by accent/case.
86+
pub fn compute_sort_key_utf8(input: &[u8], buf: &mut Vec<u8>) -> bool {
8187
let c = COLLATOR
8288
.get()
8389
.expect("compute_sort_key_utf8 called before collator initialization");
84-
c.write_sort_key_utf8_to(input, buf)
90+
// Only inputs strictly longer than the limit are truncated; an input of
// exactly `SORT_KEY_PREFIX_LIMIT` bytes is keyed in full.
let truncated = input.len() > SORT_KEY_PREFIX_LIMIT;
91+
let effective_input = if truncated {
92+
// `input[end]` is in bounds here: `truncated` implies
// `input.len() > SORT_KEY_PREFIX_LIMIT`, and `end` only decreases.
let mut end = SORT_KEY_PREFIX_LIMIT;
93+
// Walk back while the byte at the cut point is a continuation byte, so
// the slice never ends in the middle of a multi-byte sequence. For input
// made entirely of continuation bytes this reaches 0 and keys an empty
// prefix.
while end > 0 && !is_utf8_char_boundary(input[end]) {
94+
end -= 1;
95+
}
96+
&input[..end]
97+
} else {
98+
input
99+
};
100+
c.write_sort_key_utf8_to(effective_input, buf)
85101
.expect("ICU write_sort_key_utf8_to failed");
102+
// Report truncation so the caller knows an equal key is not conclusive.
truncated
103+
}
104+
105+
/// Returns `true` when `b` is not a UTF-8 continuation byte (`10xxxxxx`),
/// i.e. when a slice may be cut immediately before it without splitting a
/// multi-byte sequence.
#[inline]
106+
fn is_utf8_char_boundary(b: u8) -> bool {
107+
// ASCII (0xxxxxxx) or UTF-8 leading byte (11xxxxxx).
// Reinterpreted as `i8`, continuation bytes 0x80..=0xBF become -128..=-65,
// which is everything below -0x40 (-64); all other byte values are >= -64.
108+
(b as i8) >= -0x40
86109
}
87110

88111
/// Compare both strings with regard to the current locale.

tests/by-util/test_sort.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2981,4 +2981,52 @@ fn test_consistent_sorting_with_i18n_collate() {
29812981
.stdout_is(expected_output);
29822982
}
29832983

2984+
#[test]
2985+
#[cfg(unix)]
2986+
fn test_locale_utf8_long_lines_differ_after_prefix_limit() {
2987+
// Regression test for #12138: lines sharing a prefix longer than the
2988+
// 8 KiB sort-key limit must fall back to full locale comparison.
2989+
let locale = "en_US.UTF-8";
2990+
// Return early (the test then passes without asserting anything) when
// `is_locale_available` reports that the locale cannot be used.
if !is_locale_available(locale) {
2991+
return;
2992+
}
2993+
// 16 KiB of identical ASCII: twice the 8 KiB limit, so both lines are
// truncated to the same prefix and only the final byte differs.
let prefix = "x".repeat(16 * 1024);
2994+
let line_a = format!("{prefix}a\n");
2995+
let line_b = format!("{prefix}b\n");
2996+
// Feed the lines in reverse order so an unsorted pass-through would fail.
let input = format!("{line_b}{line_a}");
2997+
let expected = format!("{line_a}{line_b}");
2998+
new_ucmd!()
2999+
.env("LC_ALL", locale)
3000+
.pipe_in(input)
3001+
.succeeds()
3002+
.stdout_is(expected);
3003+
}
3004+
3005+
#[test]
3006+
#[cfg(unix)]
3007+
fn test_locale_utf8_truncation_at_multibyte_boundary() {
3008+
// Construct lines whose byte length is just over the 8 KiB sort-key
3009+
// prefix limit, with a multi-byte UTF-8 character (é = 0xC3 0xA9)
3010+
// straddling that boundary. The truncation logic must back off to a
3011+
// valid char boundary and not split the multi-byte sequence; the
3012+
// fallback path must then order the lines correctly.
//
// NOTE(review): both lines are byte-identical up to the limit, so this only
// exercises the "prefix keys tie -> fallback" path. A case where the
// truncated prefixes differ only by accent (e.g. `e` vs `é`) while the tails
// differ in a base letter is not covered here.
3013+
let locale = "en_US.UTF-8";
3014+
// Return early (the test then passes without asserting anything) when
// `is_locale_available` reports that the locale cannot be used.
if !is_locale_available(locale) {
3015+
return;
3016+
}
3017+
// Pad to one byte before the 8 KiB limit, then place "éa" / "éb" so
3018+
// 'é' begins at byte 8191 (straddling 8192) and the differing ASCII
3019+
// byte ('a' vs 'b') sits past the limit.
3020+
let pad = "x".repeat(8 * 1024 - 1);
3021+
let line_a = format!("{pad}éa\n");
3022+
let line_b = format!("{pad}éb\n");
3023+
// Feed the lines in reverse order so an unsorted pass-through would fail.
let input = format!("{line_b}{line_a}");
3024+
let expected = format!("{line_a}{line_b}");
3025+
new_ucmd!()
3026+
.env("LC_ALL", locale)
3027+
.pipe_in(input)
3028+
.succeeds()
3029+
.stdout_is(expected);
3030+
}
3031+
29843032
/* spell-checker: enable */

0 commit comments

Comments
 (0)