Skip to content

Commit 485b156

Browse files
authored
sort: add benchmark for locale UTF-8 sorting (#12185)
1 parent 912471d commit 485b156

1 file changed

Lines changed: 62 additions & 0 deletions

File tree

src/uu/sort/benches/sort_locale_utf8_bench.rs

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,68 @@ fn sort_unique_utf8_locale(bencher: Bencher) {
9292
});
9393
}
9494

95+
/// Benchmark sorting very long lines (single repeated character per line) with UTF-8 locale.
96+
/// This reproduces the pathological case from issue #12138 where computing full collation
97+
/// sort keys for multi-megabyte lines caused a 40x slowdown vs GNU sort.
98+
/// We use 1 MB lines (26 lines, one per letter) to keep the benchmark fast while still
99+
/// exercising the prefix-based sort key optimization.
100+
#[divan::bench]
101+
fn sort_very_long_lines_utf8_locale(bencher: Bencher) {
102+
let mut data = Vec::new();
103+
// Create 26 lines of 1 MB each, each line is a single repeated letter
104+
let letters: Vec<u8> = (b'a'..=b'z').collect();
105+
for &ch in &letters {
106+
data.extend(std::iter::repeat_n(ch, 1_000_000));
107+
data.push(b'\n');
108+
}
109+
let file_path = setup_test_file(&data);
110+
let output_file = NamedTempFile::new().unwrap();
111+
let output_path = output_file.path().to_str().unwrap().to_string();
112+
113+
let args = [
114+
"--parallel",
115+
"1",
116+
"-o",
117+
&output_path,
118+
file_path.to_str().unwrap(),
119+
];
120+
// Warm up
121+
black_box(run_util_function(uumain, &args));
122+
bencher.bench(|| {
123+
black_box(run_util_function(uumain, &args));
124+
});
125+
}
126+
127+
/// Benchmark sorting lines that share a long common prefix but differ after 8 KB,
128+
/// exercising the fallback from prefix sort keys to full locale comparison.
129+
#[divan::bench]
130+
fn sort_long_common_prefix_utf8_locale(bencher: Bencher) {
131+
let mut data = Vec::new();
132+
let prefix_len = 16 * 1024; // 16 KB common prefix (exceeds the 8 KB sort key limit)
133+
let prefix: Vec<u8> = std::iter::repeat_n(b'x', prefix_len).collect();
134+
// 26 lines that share the prefix but differ in the suffix
135+
for ch in b'a'..=b'z' {
136+
data.extend_from_slice(&prefix);
137+
data.extend(std::iter::repeat_n(ch, 100));
138+
data.push(b'\n');
139+
}
140+
let file_path = setup_test_file(&data);
141+
let output_file = NamedTempFile::new().unwrap();
142+
let output_path = output_file.path().to_str().unwrap().to_string();
143+
144+
let args = [
145+
"--parallel",
146+
"1",
147+
"-o",
148+
&output_path,
149+
file_path.to_str().unwrap(),
150+
];
151+
black_box(run_util_function(uumain, &args));
152+
bencher.bench(|| {
153+
black_box(run_util_function(uumain, &args));
154+
});
155+
}
156+
95157
fn main() {
96158
// Set UTF-8 locale BEFORE any benchmarks run.
97159
// This must happen before divan::main() because the locale is cached

0 commit comments

Comments
 (0)