Skip to content

Commit 725b89f

Browse files
author
Gunter Schmidt
committed
changed file num lines to file size in kb
1 parent fe9b622 commit 725b89f

3 files changed

Lines changed: 133 additions & 64 deletions

File tree

Cargo.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,10 @@ lto = "thin"
4747
name = "bench_diffutils"
4848
path = "benches/bench-diffutils.rs"
4949
harness = false
50+
51+
[features]
52+
# default = ["feat_bench_not_diff"]
53+
# Turn bench for diffutils cmp off
54+
feat_bench_not_cmp = []
55+
# Turn bench for diffutils diff off
56+
feat_bench_not_diff = []

benches/bench-diffutils.rs

Lines changed: 99 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,50 @@
1+
// This file is part of the uutils diffutils package.
2+
//
3+
// For the full copyright and license information, please view the LICENSE-*
4+
// files that were distributed with this source code.
5+
16
//! Benches for all utils in diffutils.
2-
const FILE_LINES: [usize; 3] = [10_000, 100_000, 500_000];
7+
//!
8+
//! There is a file generator included to create files of different sizes for comparison. \
9+
//! Set the `TEMP_DIR` const to keep the files. The `df_to_file_*` files contain small changes; search for '#' to find them. \
10+
//! File generation up to 1 GB is really fast, but benchmarking above 100 MB takes a very long time.
11+
12+
/// Generate test files with these sizes in KB.
13+
const FILE_SIZE_KILO_BYTES: [u64; 4] = [100, 1 * MB, 10 * MB, 25 * MB];
14+
// const FILE_SIZE_KILO_BYTES: [u64; 3] = [100, 1 * MB, 5 * MB];
315
// Empty String to use TempDir (files will be removed after test) or specify dir to keep generated files
416
const TEMP_DIR: &str = "";
5-
const NUM_DIFF: usize = 4;
17+
const NUM_DIFF: u64 = 4;
18+
// 1 MB expressed in KB (1 MB = 1_000 KB); only used to build FILE_SIZE_KILO_BYTES
19+
const MB: u64 = 1_000;
20+
const CHANGE_CHAR: u8 = b'#';
621

22+
#[cfg(not(feature = "feat_bench_not_cmp"))]
723
mod diffutils_cmp {
824
use std::hint::black_box;
925

1026
use diffutilslib::cmp;
1127
use divan::Bencher;
1228

13-
use crate::{binary, prepare::*, FILE_LINES};
14-
15-
// // test the impact on the benchmark if not converting the cmd to Vec<OsString> (doubles for parse)
16-
// #[divan::bench]
17-
// fn parser_cmp_no_prepare() {
18-
// let cmd = "cmd file_1.txt file_2.txt -bl n10M --ignore-initial=100KiB:1MiB";
19-
// let args = str_to_options(&cmd).into_iter().peekable();
20-
// let _ = cmp::parse_params(args);
21-
// }
29+
use crate::{binary, prepare::*, FILE_SIZE_KILO_BYTES};
2230

23-
#[divan::bench(args = FILE_LINES)]
24-
fn diff_compare_files_equal(bencher: Bencher, lines: usize) {
25-
let (from, to) = get_context().get_test_files_equal(lines);
31+
#[divan::bench(args = FILE_SIZE_KILO_BYTES)]
32+
fn cmp_compare_files_equal(bencher: Bencher, kb: u64) {
33+
let (from, to) = get_context().get_test_files_equal(kb);
2634
let cmd = format!("cmp {from} {to}");
2735
let opts = str_to_options(&cmd).into_iter().peekable();
2836
let params = cmp::parse_params(opts).unwrap();
37+
2938
bencher
3039
// .with_inputs(|| prepare::cmp_params_identical_testfiles(lines))
3140
.with_inputs(|| params.clone())
3241
.bench_refs(|params| black_box(cmp::cmp(&params).unwrap()));
3342
}
3443

3544
// bench the actual compare; cmp exits on first difference
36-
#[divan::bench(args = FILE_LINES)]
37-
fn diff_compare_files_diff(bencher: Bencher, lines: usize) {
38-
let (from, to) = get_context().get_test_files_different(lines);
45+
#[divan::bench(args = FILE_SIZE_KILO_BYTES)]
46+
fn cmp_compare_files_different(bencher: Bencher, bytes: u64) {
47+
let (from, to) = get_context().get_test_files_different(bytes);
3948
let cmd = format!("cmp {from} {to} -s");
4049
let opts = str_to_options(&cmd).into_iter().peekable();
4150
let params = cmp::parse_params(opts).unwrap();
@@ -47,9 +56,9 @@ mod diffutils_cmp {
4756
}
4857

4958
// bench original GNU cmp
50-
#[divan::bench(args = FILE_LINES)]
51-
fn cmd_cmp_gnu_equal(bencher: Bencher, num_lines: usize) {
52-
let (from, to) = get_context().get_test_files_equal(num_lines);
59+
#[divan::bench(args = FILE_SIZE_KILO_BYTES)]
60+
fn cmd_cmp_gnu_equal(bencher: Bencher, bytes: u64) {
61+
let (from, to) = get_context().get_test_files_equal(bytes);
5362
let args_str = format!("{from} {to}");
5463
bencher
5564
// .with_inputs(|| prepare::cmp_params_identical_testfiles(lines))
@@ -58,9 +67,9 @@ mod diffutils_cmp {
5867
}
5968

6069
// bench the compiled release version
61-
#[divan::bench(args = FILE_LINES)]
62-
fn cmd_cmp_release_equal(bencher: Bencher, num_lines: usize) {
63-
let (from, to) = get_context().get_test_files_equal(num_lines);
70+
#[divan::bench(args = FILE_SIZE_KILO_BYTES)]
71+
fn cmd_cmp_release_equal(bencher: Bencher, bytes: u64) {
72+
let (from, to) = get_context().get_test_files_equal(bytes);
6473
let args_str = format!("cmp {from} {to}");
6574

6675
bencher
@@ -70,17 +79,18 @@ mod diffutils_cmp {
7079
}
7180
}
7281

82+
#[cfg(not(feature = "feat_bench_not_diff"))]
7383
mod diffutils_diff {
7484
// use std::hint::black_box;
7585

76-
use crate::{binary, prepare::*, FILE_LINES};
86+
use crate::{binary, prepare::*, FILE_SIZE_KILO_BYTES};
7787
// use diffutilslib::params;
7888
use divan::Bencher;
7989

8090
// bench the actual compare
8191
// TODO diff does not have a diff function
8292
// #[divan::bench(args = [100_000,10_000])]
83-
// fn diff_compare_files(bencher: Bencher, lines: usize) {
93+
// fn diff_compare_files(bencher: Bencher, bytes: u64) {
8494
// let (from, to) = gen_testfiles(lines, 0, "id");
8595
// let cmd = format!("cmp {from} {to}");
8696
// let opts = str_to_options(&cmd).into_iter().peekable();
@@ -93,9 +103,9 @@ mod diffutils_diff {
93103
// }
94104

95105
// bench original GNU diff
96-
#[divan::bench(args = FILE_LINES)]
97-
fn cmd_diff_gnu_equal(bencher: Bencher, num_lines: usize) {
98-
let (from, to) = get_context().get_test_files_equal(num_lines);
106+
#[divan::bench(args = FILE_SIZE_KILO_BYTES)]
107+
fn cmd_diff_gnu_equal(bencher: Bencher, bytes: u64) {
108+
let (from, to) = get_context().get_test_files_equal(bytes);
99109
let args_str = format!("{from} {to}");
100110
bencher
101111
// .with_inputs(|| prepare::cmp_params_identical_testfiles(lines))
@@ -104,9 +114,9 @@ mod diffutils_diff {
104114
}
105115

106116
// bench the compiled release version
107-
#[divan::bench(args = FILE_LINES)]
108-
fn cmd_diff_release_equal(bencher: Bencher, num_lines: usize) {
109-
let (from, to) = get_context().get_test_files_equal(num_lines);
117+
#[divan::bench(args = FILE_SIZE_KILO_BYTES)]
118+
fn cmd_diff_release_equal(bencher: Bencher, bytes: u64) {
119+
let (from, to) = get_context().get_test_files_equal(bytes);
110120
let args_str = format!("diff {from} {to}");
111121

112122
bencher
@@ -134,6 +144,14 @@ mod parser {
134144
.bench_values(|data| black_box(cmp::parse_params(data)));
135145
}
136146

147+
// // test the impact on the benchmark if not converting the cmd to Vec<OsString> (doubles for parse)
148+
// #[divan::bench]
149+
// fn cmp_parser_no_prepare() {
150+
// let cmd = "cmd file_1.txt file_2.txt -bl n10M --ignore-initial=100KiB:1MiB";
151+
// let args = str_to_options(&cmd).into_iter().peekable();
152+
// let _ = cmp::parse_params(args);
153+
// }
154+
137155
// bench the time it takes to parse the command line arguments
138156
#[divan::bench]
139157
fn diff_parser(bencher: Bencher) {
@@ -157,12 +175,14 @@ mod prepare {
157175
use rand::RngExt;
158176
use tempfile::TempDir;
159177

160-
use crate::{FILE_LINES, NUM_DIFF, TEMP_DIR};
178+
use crate::{CHANGE_CHAR, FILE_SIZE_KILO_BYTES, NUM_DIFF, TEMP_DIR};
161179

162180
// the file size in bytes and .txt will be appended
163181
const FROM_FILE: &str = "from_file";
164182
const TO_FILE: &str = "to_file";
183+
const LINE_LENGTH: usize = 60;
165184

185+
/// Contains test data (file names) which only needs to be created once.
166186
#[derive(Debug, Default)]
167187
pub struct BenchContext {
168188
pub tmp_dir: Option<TempDir>,
@@ -179,20 +199,22 @@ mod prepare {
179199
}
180200
}
181201

182-
pub fn get_test_files_equal(&self, num_lines: usize) -> &(String, String) {
183-
let p = FILE_LINES.iter().position(|f| *f == num_lines).unwrap();
202+
pub fn get_test_files_equal(&self, kb: u64) -> &(String, String) {
203+
let p = FILE_SIZE_KILO_BYTES.iter().position(|f| *f == kb).unwrap();
184204
&self.files_equal[p]
185205
}
186206

187-
pub fn get_test_files_different(&self, num_lines: usize) -> &(String, String) {
188-
let p = FILE_LINES.iter().position(|f| *f == num_lines).unwrap();
207+
#[allow(unused)]
208+
pub fn get_test_files_different(&self, kb: u64) -> &(String, String) {
209+
let p = FILE_SIZE_KILO_BYTES.iter().position(|f| *f == kb).unwrap();
189210
&self.files_different[p]
190211
}
191212
}
192213

193214
// Since each bench function is separate in Divan it is more difficult to dynamically create test data.
194215
// This keeps the TempDir alive until the program exits and generates the files only once.
195216
static SHARED_CONTEXT: OnceLock<BenchContext> = OnceLock::new();
217+
/// Creates the test files once and provides them to all tests.
196218
pub fn get_context() -> &'static BenchContext {
197219
SHARED_CONTEXT.get_or_init(|| {
198220
let mut ctx = BenchContext::default();
@@ -208,12 +230,12 @@ mod prepare {
208230
ctx.dir = TEMP_DIR.to_string();
209231
};
210232

211-
// generate test files
212-
for num_lines in FILE_LINES {
213-
let f = generate_test_files(ctx.get_path(), num_lines, 0, "eq")
233+
// generate test files (sizes are given in KB and converted to bytes)
234+
for kb in FILE_SIZE_KILO_BYTES {
235+
let f = generate_test_files_bytes(ctx.get_path(), kb * 1000, 0, "eq")
214236
.expect("generate_test_files failed");
215237
ctx.files_equal.push(f);
216-
let f = generate_test_files(ctx.get_path(), num_lines, NUM_DIFF, "df")
238+
let f = generate_test_files_bytes(ctx.get_path(), kb * 1000, NUM_DIFF, "df")
217239
.expect("generate_test_files failed");
218240
ctx.files_different.push(f);
219241
}
@@ -233,55 +255,54 @@ mod prepare {
233255
s
234256
}
235257

236-
// Generates the test files and returns the from and to file names.
237-
#[allow(unused)]
238-
pub fn gen_testfiles(num_lines: usize, num_diff: usize, id: &str) -> (String, String) {
239-
let dir = get_context().get_path();
240-
generate_test_files(dir, num_lines, num_diff, id).expect("generate_test_files failed")
241-
}
242-
243-
/// Generates two test files for comparison.
258+
/// Generates two test files of `bytes` size each for comparison.
244259
///
245260
/// Each line consists of 10 words with 5 letters, giving a line length of 60 bytes.
246-
/// If num_differences is set, '*' will be inserted between the first two words of a line,
261+
/// If num_differences is set, '#' will be inserted between the first two words of a line,
247262
/// evenly spaced in the file. A value of 1 puts the change in the last full line, so the comparison takes longest.
248-
fn generate_test_files(
263+
fn generate_test_files_bytes(
249264
dir: &Path,
250-
num_lines: usize,
251-
num_differences: usize,
265+
bytes: u64,
266+
num_differences: u64,
252267
id: &str,
253268
) -> std::io::Result<(String, String)> {
254269
let id = if id.is_empty() {
255270
"".to_string()
256271
} else {
257272
format!("{id}_")
258273
};
259-
let f1 = format!("{id}{FROM_FILE}_{num_lines}.txt");
260-
let f2 = format!("{id}{TO_FILE}_{num_lines}.txt");
274+
let f1 = format!("{id}{FROM_FILE}_{bytes}.txt");
275+
let f2 = format!("{id}{TO_FILE}_{bytes}.txt");
261276
let from_path = dir.join(f1);
262277
let to_path = dir.join(f2);
263278

264-
generate_file_fast(&from_path, &to_path, num_lines, num_differences)?;
279+
generate_file_bytes(&from_path, &to_path, bytes, num_differences)?;
265280

266281
Ok((
267282
from_path.to_string_lossy().to_string(),
268283
to_path.to_string_lossy().to_string(),
269284
))
270285
}
271286

272-
// Largely Gemini AI
273-
fn generate_file_fast(
287+
fn generate_file_bytes(
274288
from_name: &Path,
275289
to_name: &Path,
276-
line_count: usize,
277-
num_differences: usize,
290+
bytes: u64,
291+
num_differences: u64,
278292
) -> std::io::Result<()> {
279293
let file_from = File::create(from_name)?;
280294
let file_to = File::create(to_name)?;
281-
let change = if num_differences == 0 {
295+
// integer division rounds down, so the full lines may cover fewer bytes than requested; the remainder is written as a shorter last line
296+
let n_lines = bytes / LINE_LENGTH as u64;
297+
let change_every_n_lines = if num_differences == 0 {
282298
0
283299
} else {
284-
line_count / num_differences
300+
let c = n_lines / num_differences;
301+
if c == 0 {
302+
1
303+
} else {
304+
c
305+
}
285306
};
286307
// Use a larger 128KB buffer for massive files
287308
let mut writer_from = BufWriter::with_capacity(128 * 1024, file_from);
@@ -292,7 +313,7 @@ mod prepare {
292313
let mut line_buffer = [b' '; 60];
293314
line_buffer[59] = b'\n'; // Set the newline once at the end
294315

295-
for i in (0..line_count).rev() {
316+
for i in (0..n_lines).rev() {
296317
// Fill only the letter positions, skipping spaces and the newline
297318
for word_idx in 0..10 {
298319
let start = word_idx * 6; // Each word + space block is 6 bytes
@@ -307,14 +328,28 @@ mod prepare {
307328
if num_differences == 0 {
308329
writer_to.write_all(&line_buffer)?;
309330
} else {
310-
if i % change == 0 {
311-
line_buffer[5] = b'*';
331+
if i % change_every_n_lines == 0 && n_lines - i > 2 {
332+
line_buffer[5] = CHANGE_CHAR;
312333
}
313334
writer_to.write_all(&line_buffer)?;
314335
line_buffer[5] = b' ';
315336
}
316337
}
317338

339+
// write the remaining bytes as a shorter last line so the file has exactly the requested size
340+
let missing = (bytes - n_lines as u64 * LINE_LENGTH as u64) as usize;
341+
if missing > 0 {
342+
for word_idx in 0..10 {
343+
let start = word_idx * 6; // Each word + space block is 6 bytes
344+
for i in 0..5 {
345+
line_buffer[start + i] = rng.random_range(b'a'..b'z' + 1);
346+
}
347+
}
348+
line_buffer[missing - 1] = b'\n';
349+
writer_from.write_all(&line_buffer[0..missing])?;
350+
writer_to.write_all(&line_buffer[0..missing])?;
351+
}
352+
318353
writer_from.flush()?;
319354
writer_to.flush()?;
320355

fuzz/Cargo.lock

Lines changed: 27 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)