@@ -92,6 +92,68 @@ fn sort_unique_utf8_locale(bencher: Bencher) {
9292 } ) ;
9393}
9494
95+ /// Benchmark sorting very long lines (single repeated character per line) with UTF-8 locale.
96+ /// This reproduces the pathological case from issue #12138 where computing full collation
97+ /// sort keys for multi-megabyte lines caused a 40x slowdown vs GNU sort.
98+ /// We use 1 MB lines (26 lines, one per letter) to keep the benchmark fast while still
99+ /// exercising the prefix-based sort key optimization.
100+ #[ divan:: bench]
101+ fn sort_very_long_lines_utf8_locale ( bencher : Bencher ) {
102+ let mut data = Vec :: new ( ) ;
103+ // Create 26 lines of 1 MB each, each line is a single repeated letter
104+ let letters: Vec < u8 > = ( b'a' ..=b'z' ) . collect ( ) ;
105+ for & ch in & letters {
106+ data. extend ( std:: iter:: repeat_n ( ch, 1_000_000 ) ) ;
107+ data. push ( b'\n' ) ;
108+ }
109+ let file_path = setup_test_file ( & data) ;
110+ let output_file = NamedTempFile :: new ( ) . unwrap ( ) ;
111+ let output_path = output_file. path ( ) . to_str ( ) . unwrap ( ) . to_string ( ) ;
112+
113+ let args = [
114+ "--parallel" ,
115+ "1" ,
116+ "-o" ,
117+ & output_path,
118+ file_path. to_str ( ) . unwrap ( ) ,
119+ ] ;
120+ // Warm up
121+ black_box ( run_util_function ( uumain, & args) ) ;
122+ bencher. bench ( || {
123+ black_box ( run_util_function ( uumain, & args) ) ;
124+ } ) ;
125+ }
126+
127+ /// Benchmark sorting lines that share a long common prefix but differ after 8 KB,
128+ /// exercising the fallback from prefix sort keys to full locale comparison.
129+ #[ divan:: bench]
130+ fn sort_long_common_prefix_utf8_locale ( bencher : Bencher ) {
131+ let mut data = Vec :: new ( ) ;
132+ let prefix_len = 16 * 1024 ; // 16 KB common prefix (exceeds the 8 KB sort key limit)
133+ let prefix: Vec < u8 > = std:: iter:: repeat_n ( b'x' , prefix_len) . collect ( ) ;
134+ // 26 lines that share the prefix but differ in the suffix
135+ for ch in b'a' ..=b'z' {
136+ data. extend_from_slice ( & prefix) ;
137+ data. extend ( std:: iter:: repeat_n ( ch, 100 ) ) ;
138+ data. push ( b'\n' ) ;
139+ }
140+ let file_path = setup_test_file ( & data) ;
141+ let output_file = NamedTempFile :: new ( ) . unwrap ( ) ;
142+ let output_path = output_file. path ( ) . to_str ( ) . unwrap ( ) . to_string ( ) ;
143+
144+ let args = [
145+ "--parallel" ,
146+ "1" ,
147+ "-o" ,
148+ & output_path,
149+ file_path. to_str ( ) . unwrap ( ) ,
150+ ] ;
151+ black_box ( run_util_function ( uumain, & args) ) ;
152+ bencher. bench ( || {
153+ black_box ( run_util_function ( uumain, & args) ) ;
154+ } ) ;
155+ }
156+
95157fn main ( ) {
96158 // Set UTF-8 locale BEFORE any benchmarks run.
97159 // This must happen before divan::main() because the locale is cached
0 commit comments