1+ // This file is part of the uutils diffutils package.
2+ //
3+ // For the full copyright and license information, please view the LICENSE-*
4+ // files that were distributed with this source code.
5+
16//! Benches for all utils in diffutils.
2- const FILE_LINES : [ usize ; 3 ] = [ 10_000 , 100_000 , 500_000 ] ;
7+ //!
8+ //! There is a file generator included to create files of different sizes for comparison. \
9+ //! Set the TEMP_DIR const to keep the files. df_to_ files have small changes in them, search for '#'. \
10+ //! File generation up to 1 GB is really fast; benchmarking above 100 MB takes a very long time.
11+
/// Generate test files with these sizes in KB (1 KB is written as 1000 bytes).
const FILE_SIZE_KILO_BYTES: [u64; 4] = [100, MB, 10 * MB, 25 * MB];
// Alternative, smaller set:
// const FILE_SIZE_KILO_BYTES: [u64; 3] = [100, MB, 5 * MB];
/// Empty string to use a `TempDir` (files will be removed after the test),
/// or a directory path to keep the generated files.
const TEMP_DIR: &str = "";
/// Number of differences requested for each generated "different" file pair.
const NUM_DIFF: u64 = 4;
/// Kilobytes per (decimal) megabyte; only used to spell out `FILE_SIZE_KILO_BYTES`.
const MB: u64 = 1_000;
/// Byte written into a line of the "to" file to mark it as changed.
const CHANGE_CHAR: u8 = b'#';
621
22+ #[ cfg( not( feature = "feat_bench_not_cmp" ) ) ]
723mod diffutils_cmp {
824 use std:: hint:: black_box;
925
1026 use diffutilslib:: cmp;
1127 use divan:: Bencher ;
1228
13- use crate :: { binary, prepare:: * , FILE_LINES } ;
14-
15- // // test the impact on the benchmark if not converting the cmd to Vec<OsString> (doubles for parse)
16- // #[divan::bench]
17- // fn parser_cmp_no_prepare() {
18- // let cmd = "cmd file_1.txt file_2.txt -bl n10M --ignore-initial=100KiB:1MiB";
19- // let args = str_to_options(&cmd).into_iter().peekable();
20- // let _ = cmp::parse_params(args);
21- // }
29+ use crate :: { binary, prepare:: * , FILE_SIZE_KILO_BYTES } ;
2230
23- #[ divan:: bench( args = FILE_LINES ) ]
24- fn diff_compare_files_equal ( bencher : Bencher , lines : usize ) {
25- let ( from, to) = get_context ( ) . get_test_files_equal ( lines ) ;
31+ #[ divan:: bench( args = FILE_SIZE_KILO_BYTES ) ]
32+ fn cmp_compare_files_equal ( bencher : Bencher , kb : u64 ) {
33+ let ( from, to) = get_context ( ) . get_test_files_equal ( kb ) ;
2634 let cmd = format ! ( "cmp {from} {to}" ) ;
2735 let opts = str_to_options ( & cmd) . into_iter ( ) . peekable ( ) ;
2836 let params = cmp:: parse_params ( opts) . unwrap ( ) ;
37+
2938 bencher
3039 // .with_inputs(|| prepare::cmp_params_identical_testfiles(lines))
3140 . with_inputs ( || params. clone ( ) )
3241 . bench_refs ( |params| black_box ( cmp:: cmp ( & params) . unwrap ( ) ) ) ;
3342 }
3443
3544 // bench the actual compare; cmp exits on first difference
36- #[ divan:: bench( args = FILE_LINES ) ]
37- fn diff_compare_files_diff ( bencher : Bencher , lines : usize ) {
38- let ( from, to) = get_context ( ) . get_test_files_different ( lines ) ;
45+ #[ divan:: bench( args = FILE_SIZE_KILO_BYTES ) ]
46+ fn cmp_compare_files_different ( bencher : Bencher , bytes : u64 ) {
47+ let ( from, to) = get_context ( ) . get_test_files_different ( bytes ) ;
3948 let cmd = format ! ( "cmp {from} {to} -s" ) ;
4049 let opts = str_to_options ( & cmd) . into_iter ( ) . peekable ( ) ;
4150 let params = cmp:: parse_params ( opts) . unwrap ( ) ;
@@ -47,9 +56,9 @@ mod diffutils_cmp {
4756 }
4857
4958 // bench original GNU cmp
50- #[ divan:: bench( args = FILE_LINES ) ]
51- fn cmd_cmp_gnu_equal ( bencher : Bencher , num_lines : usize ) {
52- let ( from, to) = get_context ( ) . get_test_files_equal ( num_lines ) ;
59+ #[ divan:: bench( args = FILE_SIZE_KILO_BYTES ) ]
60+ fn cmd_cmp_gnu_equal ( bencher : Bencher , bytes : u64 ) {
61+ let ( from, to) = get_context ( ) . get_test_files_equal ( bytes ) ;
5362 let args_str = format ! ( "{from} {to}" ) ;
5463 bencher
5564 // .with_inputs(|| prepare::cmp_params_identical_testfiles(lines))
@@ -58,9 +67,9 @@ mod diffutils_cmp {
5867 }
5968
6069 // bench the compiled release version
61- #[ divan:: bench( args = FILE_LINES ) ]
62- fn cmd_cmp_release_equal ( bencher : Bencher , num_lines : usize ) {
63- let ( from, to) = get_context ( ) . get_test_files_equal ( num_lines ) ;
70+ #[ divan:: bench( args = FILE_SIZE_KILO_BYTES ) ]
71+ fn cmd_cmp_release_equal ( bencher : Bencher , bytes : u64 ) {
72+ let ( from, to) = get_context ( ) . get_test_files_equal ( bytes ) ;
6473 let args_str = format ! ( "cmp {from} {to}" ) ;
6574
6675 bencher
@@ -70,17 +79,18 @@ mod diffutils_cmp {
7079 }
7180}
7281
82+ #[ cfg( not( feature = "feat_bench_not_diff" ) ) ]
7383mod diffutils_diff {
7484 // use std::hint::black_box;
7585
76- use crate :: { binary, prepare:: * , FILE_LINES } ;
86+ use crate :: { binary, prepare:: * , FILE_SIZE_KILO_BYTES } ;
7787 // use diffutilslib::params;
7888 use divan:: Bencher ;
7989
8090 // bench the actual compare
8191 // TODO diff does not have a diff function
8292 // #[divan::bench(args = [100_000,10_000])]
83- // fn diff_compare_files(bencher: Bencher, lines: usize ) {
93+ // fn diff_compare_files(bencher: Bencher, bytes: u64 ) {
8494 // let (from, to) = gen_testfiles(lines, 0, "id");
8595 // let cmd = format!("cmp {from} {to}");
8696 // let opts = str_to_options(&cmd).into_iter().peekable();
@@ -93,9 +103,9 @@ mod diffutils_diff {
93103 // }
94104
95105 // bench original GNU diff
96- #[ divan:: bench( args = FILE_LINES ) ]
97- fn cmd_diff_gnu_equal ( bencher : Bencher , num_lines : usize ) {
98- let ( from, to) = get_context ( ) . get_test_files_equal ( num_lines ) ;
106+ #[ divan:: bench( args = FILE_SIZE_KILO_BYTES ) ]
107+ fn cmd_diff_gnu_equal ( bencher : Bencher , bytes : u64 ) {
108+ let ( from, to) = get_context ( ) . get_test_files_equal ( bytes ) ;
99109 let args_str = format ! ( "{from} {to}" ) ;
100110 bencher
101111 // .with_inputs(|| prepare::cmp_params_identical_testfiles(lines))
@@ -104,9 +114,9 @@ mod diffutils_diff {
104114 }
105115
106116 // bench the compiled release version
107- #[ divan:: bench( args = FILE_LINES ) ]
108- fn cmd_diff_release_equal ( bencher : Bencher , num_lines : usize ) {
109- let ( from, to) = get_context ( ) . get_test_files_equal ( num_lines ) ;
117+ #[ divan:: bench( args = FILE_SIZE_KILO_BYTES ) ]
118+ fn cmd_diff_release_equal ( bencher : Bencher , bytes : u64 ) {
119+ let ( from, to) = get_context ( ) . get_test_files_equal ( bytes ) ;
110120 let args_str = format ! ( "diff {from} {to}" ) ;
111121
112122 bencher
@@ -134,6 +144,14 @@ mod parser {
134144 . bench_values ( |data| black_box ( cmp:: parse_params ( data) ) ) ;
135145 }
136146
147+ // // test the impact on the benchmark if not converting the cmd to Vec<OsString> (doubles for parse)
148+ // #[divan::bench]
149+ // fn cmp_parser_no_prepare() {
150+ // let cmd = "cmd file_1.txt file_2.txt -bl n10M --ignore-initial=100KiB:1MiB";
151+ // let args = str_to_options(&cmd).into_iter().peekable();
152+ // let _ = cmp::parse_params(args);
153+ // }
154+
137155 // bench the time it takes to parse the command line arguments
138156 #[ divan:: bench]
139157 fn diff_parser ( bencher : Bencher ) {
@@ -157,12 +175,14 @@ mod prepare {
157175 use rand:: RngExt ;
158176 use tempfile:: TempDir ;
159177
160- use crate :: { FILE_LINES , NUM_DIFF , TEMP_DIR } ;
178+ use crate :: { CHANGE_CHAR , FILE_SIZE_KILO_BYTES , NUM_DIFF , TEMP_DIR } ;
161179
162180 // file lines and .txt will be added
163181 const FROM_FILE : & str = "from_file" ;
164182 const TO_FILE : & str = "to_file" ;
183+ const LINE_LENGTH : usize = 60 ;
165184
185+ /// Contains test data (file names) which only needs to be created once.
166186 #[ derive( Debug , Default ) ]
167187 pub struct BenchContext {
168188 pub tmp_dir : Option < TempDir > ,
@@ -179,20 +199,22 @@ mod prepare {
179199 }
180200 }
181201
182- pub fn get_test_files_equal ( & self , num_lines : usize ) -> & ( String , String ) {
183- let p = FILE_LINES . iter ( ) . position ( |f| * f == num_lines ) . unwrap ( ) ;
202+ pub fn get_test_files_equal ( & self , kb : u64 ) -> & ( String , String ) {
203+ let p = FILE_SIZE_KILO_BYTES . iter ( ) . position ( |f| * f == kb ) . unwrap ( ) ;
184204 & self . files_equal [ p]
185205 }
186206
187- pub fn get_test_files_different ( & self , num_lines : usize ) -> & ( String , String ) {
188- let p = FILE_LINES . iter ( ) . position ( |f| * f == num_lines) . unwrap ( ) ;
207+ #[ allow( unused) ]
208+ pub fn get_test_files_different ( & self , kb : u64 ) -> & ( String , String ) {
209+ let p = FILE_SIZE_KILO_BYTES . iter ( ) . position ( |f| * f == kb) . unwrap ( ) ;
189210 & self . files_different [ p]
190211 }
191212 }
192213
193214 // Since each bench function is separate in Divan it is more difficult to dynamically create test data.
194215 // This keeps the TempDir alive until the program exits and generates the files only once.
195216 static SHARED_CONTEXT : OnceLock < BenchContext > = OnceLock :: new ( ) ;
217+ /// Creates the test files once and provides them to all tests.
196218 pub fn get_context ( ) -> & ' static BenchContext {
197219 SHARED_CONTEXT . get_or_init ( || {
198220 let mut ctx = BenchContext :: default ( ) ;
@@ -208,12 +230,12 @@ mod prepare {
208230 ctx. dir = TEMP_DIR . to_string ( ) ;
209231 } ;
210232
211- // generate test files
212- for num_lines in FILE_LINES {
213- let f = generate_test_files ( ctx. get_path ( ) , num_lines , 0 , "eq" )
233+ // generate test files
234+ for kb in FILE_SIZE_KILO_BYTES {
235+ let f = generate_test_files_bytes ( ctx. get_path ( ) , kb * 1000 , 0 , "eq" )
214236 . expect ( "generate_test_files failed" ) ;
215237 ctx. files_equal . push ( f) ;
216- let f = generate_test_files ( ctx. get_path ( ) , num_lines , NUM_DIFF , "df" )
238+ let f = generate_test_files_bytes ( ctx. get_path ( ) , kb * 1000 , NUM_DIFF , "df" )
217239 . expect ( "generate_test_files failed" ) ;
218240 ctx. files_different . push ( f) ;
219241 }
@@ -233,55 +255,54 @@ mod prepare {
233255 s
234256 }
235257
236- // Generates the test files and returns the from and to file names.
237- #[ allow( unused) ]
238- pub fn gen_testfiles ( num_lines : usize , num_diff : usize , id : & str ) -> ( String , String ) {
239- let dir = get_context ( ) . get_path ( ) ;
240- generate_test_files ( dir, num_lines, num_diff, id) . expect ( "generate_test_files failed" )
241- }
242-
243- /// Generates two test files for comparison.
258+ /// Generates two test files for comparison with <bytes> size.
244259 ///
245260 /// Each line consists of 10 words with 5 letters, giving a line length of 60 bytes.
246- /// If num_differences is set, '* ' will be inserted between the first two words of a line,
261+ /// If num_differences is set, '# ' will be inserted between the first two words of a line,
247262 /// evenly spaced in the file. 1 will add the change in the last line, so the comparison takes longest.
248- fn generate_test_files (
263+ fn generate_test_files_bytes (
249264 dir : & Path ,
250- num_lines : usize ,
251- num_differences : usize ,
265+ bytes : u64 ,
266+ num_differences : u64 ,
252267 id : & str ,
253268 ) -> std:: io:: Result < ( String , String ) > {
254269 let id = if id. is_empty ( ) {
255270 "" . to_string ( )
256271 } else {
257272 format ! ( "{id}_" )
258273 } ;
259- let f1 = format ! ( "{id}{FROM_FILE}_{num_lines }.txt" ) ;
260- let f2 = format ! ( "{id}{TO_FILE}_{num_lines }.txt" ) ;
274+ let f1 = format ! ( "{id}{FROM_FILE}_{bytes }.txt" ) ;
275+ let f2 = format ! ( "{id}{TO_FILE}_{bytes }.txt" ) ;
261276 let from_path = dir. join ( f1) ;
262277 let to_path = dir. join ( f2) ;
263278
264- generate_file_fast ( & from_path, & to_path, num_lines , num_differences) ?;
279+ generate_file_bytes ( & from_path, & to_path, bytes , num_differences) ?;
265280
266281 Ok ( (
267282 from_path. to_string_lossy ( ) . to_string ( ) ,
268283 to_path. to_string_lossy ( ) . to_string ( ) ,
269284 ) )
270285 }
271286
272- // Largely Gemini AI
273- fn generate_file_fast (
287+ fn generate_file_bytes (
274288 from_name : & Path ,
275289 to_name : & Path ,
276- line_count : usize ,
277- num_differences : usize ,
290+ bytes : u64 ,
291+ num_differences : u64 ,
278292 ) -> std:: io:: Result < ( ) > {
279293 let file_from = File :: create ( from_name) ?;
280294 let file_to = File :: create ( to_name) ?;
281- let change = if num_differences == 0 {
295+ // for int division, lines will be smaller than requested bytes
296+ let n_lines = bytes / LINE_LENGTH as u64 ;
297+ let change_every_n_lines = if num_differences == 0 {
282298 0
283299 } else {
284- line_count / num_differences
300+ let c = n_lines / num_differences;
301+ if c == 0 {
302+ 1
303+ } else {
304+ c
305+ }
285306 } ;
286307 // Use a larger 128KB buffer for massive files
287308 let mut writer_from = BufWriter :: with_capacity ( 128 * 1024 , file_from) ;
@@ -292,7 +313,7 @@ mod prepare {
292313 let mut line_buffer = [ b' ' ; 60 ] ;
293314 line_buffer[ 59 ] = b'\n' ; // Set the newline once at the end
294315
295- for i in ( 0 ..line_count ) . rev ( ) {
316+ for i in ( 0 ..n_lines ) . rev ( ) {
296317 // Fill only the letter positions, skipping spaces and the newline
297318 for word_idx in 0 ..10 {
298319 let start = word_idx * 6 ; // Each word + space block is 6 bytes
@@ -307,14 +328,28 @@ mod prepare {
307328 if num_differences == 0 {
308329 writer_to. write_all ( & line_buffer) ?;
309330 } else {
310- if i % change == 0 {
311- line_buffer[ 5 ] = b'*' ;
331+ if i % change_every_n_lines == 0 && n_lines - i > 2 {
332+ line_buffer[ 5 ] = CHANGE_CHAR ;
312333 }
313334 writer_to. write_all ( & line_buffer) ?;
314335 line_buffer[ 5 ] = b' ' ;
315336 }
316337 }
317338
339+ // create last line
340+ let missing = ( bytes - n_lines as u64 * LINE_LENGTH as u64 ) as usize ;
341+ if missing > 0 {
342+ for word_idx in 0 ..10 {
343+ let start = word_idx * 6 ; // Each word + space block is 6 bytes
344+ for i in 0 ..5 {
345+ line_buffer[ start + i] = rng. random_range ( b'a' ..b'z' + 1 ) ;
346+ }
347+ }
348+ line_buffer[ missing - 1 ] = b'\n' ;
349+ writer_from. write_all ( & line_buffer[ 0 ..missing] ) ?;
350+ writer_to. write_all ( & line_buffer[ 0 ..missing] ) ?;
351+ }
352+
318353 writer_from. flush ( ) ?;
319354 writer_to. flush ( ) ?;
320355
0 commit comments