Skip to content

Commit 87c332c

Browse files
authored
sort : gnu core utils test (sort-merge-fdlimit.sh) (#9849)
1 parent 525d1f8 commit 87c332c

9 files changed

Lines changed: 605 additions & 303 deletions

File tree

Cargo.lock

Lines changed: 326 additions & 278 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

deny.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ skip = [
8989
{ name = "itertools", version = "0.13.0" },
9090
# ordered-multimap
9191
{ name = "hashbrown", version = "0.14.5" },
92+
# lru (via num-prime)
93+
{ name = "hashbrown", version = "0.15.5" },
9294
# cexpr (via bindgen)
9395
{ name = "nom", version = "7.1.3" },
9496
# const-random-macro, rand_core

src/uu/sort/Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ bigdecimal = { workspace = true }
2727
binary-heap-plus = { workspace = true }
2828
clap = { workspace = true }
2929
compare = { workspace = true }
30-
ctrlc = { workspace = true }
3130
fnv = { workspace = true }
3231
itertools = { workspace = true }
3332
memchr = { workspace = true }
@@ -46,6 +45,9 @@ uucore = { workspace = true, features = [
4645
] }
4746
fluent = { workspace = true }
4847

48+
[target.'cfg(not(target_os = "redox"))'.dependencies]
49+
ctrlc = { workspace = true }
50+
4951
[target.'cfg(unix)'.dependencies]
5052
nix = { workspace = true, features = ["resource"] }
5153

src/uu/sort/locales/en-US.ftl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ sort-help-numeric = compare according to string numerical value
8585
sort-help-general-numeric = compare according to string general numerical value
8686
sort-help-version-sort = Sort by SemVer version number, eg 1.12.2 > 1.1.2
8787
sort-help-random = shuffle in random order
88+
sort-help-random-source = use FILE as a source of random data
8889
sort-help-dictionary-order = consider only blanks and alphanumeric characters
8990
sort-help-merge = merge already sorted files; do not sort
9091
sort-help-check = check for sorted input; do not sort

src/uu/sort/locales/fr-FR.ftl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ sort-help-numeric = compare selon la valeur numérique de la chaîne
6969
sort-help-general-numeric = compare selon la valeur numérique générale de la chaîne
7070
sort-help-version-sort = Trie par numéro de version SemVer, par ex. 1.12.2 > 1.1.2
7171
sort-help-random = mélange dans un ordre aléatoire
72+
sort-help-random-source = utilise FICHIER comme source de données aléatoires
7273
sort-help-dictionary-order = considère seulement les espaces et les caractères alphanumériques
7374
sort-help-merge = fusionne les fichiers déjà triés ; ne trie pas
7475
sort-help-check = vérifie l'entrée triée ; ne trie pas

src/uu/sort/src/chunks.rs

Lines changed: 87 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,26 @@
55

66
//! Utilities for reading files as chunks.
77
8+
// spell-checker:ignore ELEMS
89
#![allow(dead_code)]
910
// Suppresses the unused-code warning for `borrow_buffer` in `Chunk`
1011

1112
use std::{
1213
io::{ErrorKind, Read},
14+
ops::Range,
1315
sync::mpsc::SyncSender,
1416
};
1517

1618
use memchr::memchr_iter;
1719
use self_cell::self_cell;
1820
use uucore::error::{UResult, USimpleError};
1921

20-
use crate::{GeneralBigDecimalParseResult, GlobalSettings, Line, numeric_str_cmp::NumInfo};
22+
use crate::{
23+
GeneralBigDecimalParseResult, GlobalSettings, Line, SortMode, numeric_str_cmp::NumInfo,
24+
};
25+
26+
const MAX_TOKEN_BUFFER_BYTES: usize = 4 * 1024 * 1024;
27+
const MAX_TOKEN_BUFFER_ELEMS: usize = MAX_TOKEN_BUFFER_BYTES / std::mem::size_of::<Range<usize>>();
2128

2229
self_cell!(
2330
/// The chunk that is passed around between threads.
@@ -35,6 +42,8 @@ self_cell!(
3542
pub struct ChunkContents<'a> {
3643
pub lines: Vec<Line<'a>>,
3744
pub line_data: LineData<'a>,
45+
pub token_buffer: Vec<Range<usize>>,
46+
pub line_count_hint: usize,
3847
}
3948

4049
#[derive(Debug)]
@@ -54,6 +63,7 @@ impl Chunk {
5463
contents.line_data.num_infos.clear();
5564
contents.line_data.parsed_floats.clear();
5665
contents.line_data.line_num_floats.clear();
66+
contents.token_buffer.clear();
5767
let lines = unsafe {
5868
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
5969
// because the vector is empty.
@@ -76,6 +86,8 @@ impl Chunk {
7686
std::mem::take(&mut contents.line_data.num_infos),
7787
std::mem::take(&mut contents.line_data.parsed_floats),
7888
std::mem::take(&mut contents.line_data.line_num_floats),
89+
std::mem::take(&mut contents.token_buffer),
90+
contents.line_count_hint,
7991
)
8092
});
8193
RecycledChunk {
@@ -84,6 +96,8 @@ impl Chunk {
8496
num_infos: recycled_contents.2,
8597
parsed_floats: recycled_contents.3,
8698
line_num_floats: recycled_contents.4,
99+
token_buffer: recycled_contents.5,
100+
line_count_hint: recycled_contents.6,
87101
buffer: self.into_owner(),
88102
}
89103
}
@@ -103,6 +117,8 @@ pub struct RecycledChunk {
103117
num_infos: Vec<NumInfo>,
104118
parsed_floats: Vec<GeneralBigDecimalParseResult>,
105119
line_num_floats: Vec<Option<f64>>,
120+
token_buffer: Vec<Range<usize>>,
121+
line_count_hint: usize,
106122
buffer: Vec<u8>,
107123
}
108124

@@ -114,6 +130,8 @@ impl RecycledChunk {
114130
num_infos: Vec::new(),
115131
parsed_floats: Vec::new(),
116132
line_num_floats: Vec::new(),
133+
token_buffer: Vec::new(),
134+
line_count_hint: 0,
117135
buffer: vec![0; capacity],
118136
}
119137
}
@@ -157,6 +175,8 @@ pub fn read<T: Read>(
157175
num_infos,
158176
parsed_floats,
159177
line_num_floats,
178+
mut token_buffer,
179+
mut line_count_hint,
160180
mut buffer,
161181
} = recycled_chunk;
162182
if buffer.len() < carry_over.len() {
@@ -193,8 +213,21 @@ pub fn read<T: Read>(
193213
parsed_floats,
194214
line_num_floats,
195215
};
196-
parse_lines(read, &mut lines, &mut line_data, separator, settings);
197-
Ok(ChunkContents { lines, line_data })
216+
parse_lines(
217+
read,
218+
&mut lines,
219+
&mut line_data,
220+
&mut token_buffer,
221+
&mut line_count_hint,
222+
separator,
223+
settings,
224+
);
225+
Ok(ChunkContents {
226+
lines,
227+
line_data,
228+
token_buffer,
229+
line_count_hint,
230+
})
198231
});
199232
sender.send(payload?).unwrap();
200233
}
@@ -206,6 +239,8 @@ fn parse_lines<'a>(
206239
read: &'a [u8],
207240
lines: &mut Vec<Line<'a>>,
208241
line_data: &mut LineData<'a>,
242+
token_buffer: &mut Vec<Range<usize>>,
243+
line_count_hint: &mut usize,
209244
separator: u8,
210245
settings: &GlobalSettings,
211246
) {
@@ -216,12 +251,55 @@ fn parse_lines<'a>(
216251
assert!(line_data.num_infos.is_empty());
217252
assert!(line_data.parsed_floats.is_empty());
218253
assert!(line_data.line_num_floats.is_empty());
219-
let mut token_buffer = vec![];
220-
lines.extend(
221-
read.split(|&c| c == separator)
222-
.enumerate()
223-
.map(|(index, line)| Line::create(line, index, line_data, &mut token_buffer, settings)),
224-
);
254+
token_buffer.clear();
255+
if token_buffer.capacity() > MAX_TOKEN_BUFFER_ELEMS {
256+
token_buffer.shrink_to(MAX_TOKEN_BUFFER_ELEMS);
257+
}
258+
const SMALL_CHUNK_BYTES: usize = 64 * 1024;
259+
let mut estimated = (*line_count_hint).max(1);
260+
let mut exact_line_count = None;
261+
if *line_count_hint == 0 || read.len() <= SMALL_CHUNK_BYTES {
262+
let count = if read.is_empty() {
263+
1
264+
} else {
265+
memchr_iter(separator, read).count() + 1
266+
};
267+
exact_line_count = Some(count);
268+
estimated = count;
269+
} else if estimated == 1 {
270+
const LINE_LEN_HINT: usize = 32;
271+
estimated = (read.len() / LINE_LEN_HINT).max(1);
272+
}
273+
lines.reserve(estimated);
274+
if settings.precomputed.selections_per_line > 0 {
275+
line_data
276+
.selections
277+
.reserve(estimated.saturating_mul(settings.precomputed.selections_per_line));
278+
}
279+
if settings.precomputed.num_infos_per_line > 0 {
280+
line_data
281+
.num_infos
282+
.reserve(estimated.saturating_mul(settings.precomputed.num_infos_per_line));
283+
}
284+
if settings.precomputed.floats_per_line > 0 {
285+
line_data
286+
.parsed_floats
287+
.reserve(estimated.saturating_mul(settings.precomputed.floats_per_line));
288+
}
289+
if settings.mode == SortMode::Numeric {
290+
line_data.line_num_floats.reserve(estimated);
291+
}
292+
let mut start = 0usize;
293+
let mut index = 0usize;
294+
for sep_idx in memchr_iter(separator, read) {
295+
let line = &read[start..sep_idx];
296+
lines.push(Line::create(line, index, line_data, token_buffer, settings));
297+
index += 1;
298+
start = sep_idx + 1;
299+
}
300+
let line = &read[start..];
301+
lines.push(Line::create(line, index, line_data, token_buffer, settings));
302+
*line_count_hint = exact_line_count.unwrap_or(index + 1);
225303
}
226304

227305
/// Read from `file` into `buffer`.

src/uu/sort/src/merge.rs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ use uucore::error::{FromIo, UResult};
3030
use crate::{
3131
GlobalSettings, Output, SortError,
3232
chunks::{self, Chunk, RecycledChunk},
33-
compare_by, fd_soft_limit, open,
33+
compare_by, current_open_fd_count, fd_soft_limit, open,
3434
tmp_dir::TmpDirWrapper,
3535
};
3636

@@ -66,14 +66,19 @@ fn replace_output_file_in_input_files(
6666
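/// file-descriptor soft limit after subtracting the descriptors that are already open, the reserved descriptors (temporary output, Ctrl-C handler, optional random source), and a safety margin.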
6767
fn effective_merge_batch_size(settings: &GlobalSettings) -> usize {
6868
const MIN_BATCH_SIZE: usize = 2;
69-
const RESERVED_STDIO: usize = 3;
70-
const RESERVED_OUTPUT: usize = 1;
69+
const RESERVED_TMP_OUTPUT: usize = 1;
70+
const RESERVED_CTRL_C: usize = 2;
71+
const RESERVED_RANDOM_SOURCE: usize = 1;
7172
const SAFETY_MARGIN: usize = 1;
7273
let mut batch_size = settings.merge_batch_size.max(MIN_BATCH_SIZE);
7374

7475
if let Some(limit) = fd_soft_limit() {
75-
let reserved = RESERVED_STDIO + RESERVED_OUTPUT + SAFETY_MARGIN;
76-
let available_inputs = limit.saturating_sub(reserved);
76+
let open_fds = current_open_fd_count().unwrap_or(3);
77+
let mut reserved = RESERVED_TMP_OUTPUT + RESERVED_CTRL_C + SAFETY_MARGIN;
78+
if settings.salt.is_some() {
79+
reserved = reserved.saturating_add(RESERVED_RANDOM_SOURCE);
80+
}
81+
let available_inputs = limit.saturating_sub(open_fds.saturating_add(reserved));
7782
if available_inputs >= MIN_BATCH_SIZE {
7883
batch_size = batch_size.min(available_inputs);
7984
} else {

0 commit comments

Comments
 (0)