Skip to content

Commit 1cd2467

Browse files
authored
Merge branch 'main' into Redundant-statx-syscalls
2 parents 64eb894 + d830e84 commit 1cd2467

11 files changed

Lines changed: 147 additions & 52 deletions

File tree

.github/workflows/code-quality.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -116,7 +116,7 @@ jobs:
116116
;;
117117
esac
118118
- name: "`cargo clippy` lint testing"
119-
uses: nick-fields/retry@v3
119+
uses: nick-fields/retry@v4
120120
with:
121121
max_attempts: 3
122122
retry_on: error

Cargo.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/uu/cut/src/cut.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -260,9 +260,9 @@ fn cut_fields_newline_char_delim<R: Read, W: Write>(
260260
reader: R,
261261
out: &mut W,
262262
ranges: &[Range],
263-
only_delimited: bool,
264263
newline_char: u8,
265264
out_delim: &[u8],
265+
only_delimited: bool,
266266
) -> UResult<()> {
267267
let mut reader = BufReader::new(reader);
268268
let mut line = Vec::new();
@@ -398,9 +398,9 @@ fn cut_fields<R: Read, W: Write>(
398398
reader,
399399
out,
400400
ranges,
401-
field_opts.only_delimited,
402401
newline_char,
403402
out_delim,
403+
field_opts.only_delimited,
404404
)
405405
}
406406
Delimiter::Slice(delim) => {

src/uu/sort/src/chunks.rs

Lines changed: 43 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -53,17 +53,36 @@ pub struct LineData<'a> {
5353
pub num_infos: Vec<NumInfo>,
5454
pub parsed_floats: Vec<GeneralBigDecimalParseResult>,
5555
pub line_num_floats: Vec<Option<f64>>,
56+
/// Arena buffer holding all collation sort keys concatenated.
57+
pub collation_key_buffer: Vec<u8>,
58+
/// End offsets into `collation_key_buffer` for each line's sort key.
59+
pub collation_key_ends: Vec<usize>,
60+
}
61+
62+
impl LineData<'_> {
63+
/// Get the collation sort key for a line at the given index.
64+
pub fn collation_key(&self, index: usize) -> &[u8] {
65+
let start = if index == 0 {
66+
0
67+
} else {
68+
self.collation_key_ends[index - 1]
69+
};
70+
let end = self.collation_key_ends[index];
71+
&self.collation_key_buffer[start..end]
72+
}
5673
}
5774

5875
impl Chunk {
5976
/// Destroy this chunk and return its components to be reused.
6077
pub fn recycle(mut self) -> RecycledChunk {
61-
let recycled_contents = self.with_dependent_mut(|_, contents| {
78+
let mut recycled_contents = self.with_dependent_mut(|_, contents| {
6279
contents.lines.clear();
6380
contents.line_data.selections.clear();
6481
contents.line_data.num_infos.clear();
6582
contents.line_data.parsed_floats.clear();
6683
contents.line_data.line_num_floats.clear();
84+
contents.line_data.collation_key_buffer.clear();
85+
contents.line_data.collation_key_ends.clear();
6786
contents.token_buffer.clear();
6887
let lines = unsafe {
6988
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
@@ -81,26 +100,22 @@ impl Chunk {
81100
&mut contents.line_data.selections,
82101
))
83102
};
84-
(
103+
RecycledChunk {
85104
lines,
86105
selections,
87-
std::mem::take(&mut contents.line_data.num_infos),
88-
std::mem::take(&mut contents.line_data.parsed_floats),
89-
std::mem::take(&mut contents.line_data.line_num_floats),
90-
std::mem::take(&mut contents.token_buffer),
91-
contents.line_count_hint,
92-
)
106+
num_infos: std::mem::take(&mut contents.line_data.num_infos),
107+
parsed_floats: std::mem::take(&mut contents.line_data.parsed_floats),
108+
line_num_floats: std::mem::take(&mut contents.line_data.line_num_floats),
109+
collation_key_buffer: std::mem::take(&mut contents.line_data.collation_key_buffer),
110+
collation_key_ends: std::mem::take(&mut contents.line_data.collation_key_ends),
111+
token_buffer: std::mem::take(&mut contents.token_buffer),
112+
line_count_hint: contents.line_count_hint,
113+
// buffer is set below after we consume `self`
114+
buffer: Vec::new(),
115+
}
93116
});
94-
RecycledChunk {
95-
lines: recycled_contents.0,
96-
selections: recycled_contents.1,
97-
num_infos: recycled_contents.2,
98-
parsed_floats: recycled_contents.3,
99-
line_num_floats: recycled_contents.4,
100-
token_buffer: recycled_contents.5,
101-
line_count_hint: recycled_contents.6,
102-
buffer: self.into_owner(),
103-
}
117+
recycled_contents.buffer = self.into_owner();
118+
recycled_contents
104119
}
105120

106121
pub fn lines(&self) -> &Vec<Line<'_>> {
@@ -118,6 +133,8 @@ pub struct RecycledChunk {
118133
num_infos: Vec<NumInfo>,
119134
parsed_floats: Vec<GeneralBigDecimalParseResult>,
120135
line_num_floats: Vec<Option<f64>>,
136+
collation_key_buffer: Vec<u8>,
137+
collation_key_ends: Vec<usize>,
121138
token_buffer: Vec<Range<usize>>,
122139
line_count_hint: usize,
123140
buffer: Vec<u8>,
@@ -131,6 +148,8 @@ impl RecycledChunk {
131148
num_infos: Vec::new(),
132149
parsed_floats: Vec::new(),
133150
line_num_floats: Vec::new(),
151+
collation_key_buffer: Vec::new(),
152+
collation_key_ends: Vec::new(),
134153
token_buffer: Vec::new(),
135154
line_count_hint: 0,
136155
buffer: vec![0; capacity],
@@ -176,6 +195,8 @@ pub fn read<T: Read>(
176195
num_infos,
177196
parsed_floats,
178197
line_num_floats,
198+
collation_key_buffer,
199+
collation_key_ends,
179200
mut token_buffer,
180201
mut line_count_hint,
181202
mut buffer,
@@ -214,6 +235,8 @@ pub fn read<T: Read>(
214235
num_infos,
215236
parsed_floats,
216237
line_num_floats,
238+
collation_key_buffer,
239+
collation_key_ends,
217240
};
218241
parse_lines(
219242
read,
@@ -253,6 +276,8 @@ fn parse_lines<'a>(
253276
assert!(line_data.num_infos.is_empty());
254277
assert!(line_data.parsed_floats.is_empty());
255278
assert!(line_data.line_num_floats.is_empty());
279+
assert!(line_data.collation_key_buffer.is_empty());
280+
assert!(line_data.collation_key_ends.is_empty());
256281
token_buffer.clear();
257282
const SMALL_CHUNK_BYTES: usize = 64 * 1024;
258283
let mut estimated = (*line_count_hint).max(1);

src/uu/sort/src/sort.rs

Lines changed: 25 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ use uucore::error::{FromIo, strip_errno};
4848
use uucore::error::{UError, UResult, USimpleError, UUsageError};
4949
use uucore::extendedbigdecimal::ExtendedBigDecimal;
5050
#[cfg(feature = "i18n-collator")]
51-
use uucore::i18n::collator::locale_cmp;
51+
use uucore::i18n::collator::{compute_sort_key_utf8, locale_cmp};
5252
use uucore::i18n::decimal::locale_decimal_separator;
5353
use uucore::line_ending::LineEnding;
5454
use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
@@ -324,6 +324,7 @@ struct Precomputed {
324324
floats_per_line: usize,
325325
selections_per_line: usize,
326326
fast_lexicographic: bool,
327+
fast_locale_collation: bool,
327328
fast_ascii_insensitive: bool,
328329
tokenize_blank_thousands_sep: bool,
329330
tokenize_allow_unit_after_blank: bool,
@@ -387,6 +388,8 @@ impl GlobalSettings {
387388

388389
self.precomputed.fast_lexicographic =
389390
!disable_fast_lexicographic && self.can_use_fast_lexicographic();
391+
self.precomputed.fast_locale_collation =
392+
disable_fast_lexicographic && self.can_use_fast_lexicographic();
390393
self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive();
391394
}
392395

@@ -632,6 +635,15 @@ impl<'a> Line<'a> {
632635
token_buffer: &mut Vec<Field>,
633636
settings: &GlobalSettings,
634637
) -> Self {
638+
#[cfg(feature = "i18n-collator")]
639+
if settings.precomputed.fast_locale_collation {
640+
compute_sort_key_utf8(line, &mut line_data.collation_key_buffer);
641+
line_data
642+
.collation_key_ends
643+
.push(line_data.collation_key_buffer.len());
644+
return Self { line, index };
645+
}
646+
635647
let needs_line_data = settings.precomputed.needs_tokens
636648
|| settings.precomputed.selections_per_line > 0
637649
|| settings.precomputed.num_infos_per_line > 0
@@ -2614,6 +2626,18 @@ fn compare_by<'a>(
26142626
};
26152627
}
26162628

2629+
#[cfg(feature = "i18n-collator")]
2630+
if global_settings.precomputed.fast_locale_collation {
2631+
let a_key = a_line_data.collation_key(a.index);
2632+
let b_key = b_line_data.collation_key(b.index);
2633+
let cmp = a_key.cmp(b_key);
2634+
return if global_settings.reverse {
2635+
cmp.reverse()
2636+
} else {
2637+
cmp
2638+
};
2639+
}
2640+
26172641
if global_settings.precomputed.fast_ascii_insensitive {
26182642
let cmp = ascii_case_insensitive_cmp(a.line, b.line);
26192643
if cmp != Ordering::Equal || a.line == b.line {

src/uu/tr/locales/en-US.ftl

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ tr-warning-invalid-utf8 = invalid utf8 sequence
2727
2828
# Sequence parsing error messages
2929
tr-error-missing-char-class-name = missing character class name '[::]'
30+
tr-error-invalid-char-class = invalid character class { $class }
3031
tr-error-missing-equivalence-class-char = missing equivalence class character '[==]'
3132
tr-error-multiple-char-repeat-in-set2 = only one [c*] repeat construct may appear in string2
3233
tr-error-char-repeat-in-set1 = the [c*] repeat construct may not appear in string1

src/uu/tr/locales/fr-FR.ftl

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ tr-warning-invalid-utf8 = séquence UTF-8 non valide
2828
2929
# Messages d'erreur d'analyse de séquence
3030
tr-error-missing-char-class-name = nom de classe de caractères manquant '[::]'
31+
tr-error-invalid-char-class = classe de caractères non valide { $class }
3132
tr-error-missing-equivalence-class-char = caractère de classe d'équivalence manquant '[==]'
3233
tr-error-multiple-char-repeat-in-set2 = seule une construction de répétition [c*] peut apparaître dans string2
3334
tr-error-char-repeat-in-set1 = la construction de répétition [c*] ne peut pas apparaître dans string1

src/uu/tr/src/operation.rs

Lines changed: 34 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ pub trait ChunkProcessor {
3434
#[derive(Debug, Clone)]
3535
pub enum BadSequence {
3636
MissingCharClassName,
37+
InvalidCharClass(String),
3738
MissingEquivalentClassChar,
3839
MultipleCharRepeatInSet2,
3940
CharRepeatInSet1,
@@ -53,6 +54,13 @@ impl Display for BadSequence {
5354
Self::MissingCharClassName => {
5455
write!(f, "{}", translate!("tr-error-missing-char-class-name"))
5556
}
57+
Self::InvalidCharClass(class) => {
58+
write!(
59+
f,
60+
"{}",
61+
translate!("tr-error-invalid-char-class", "class" => format!("'{}'", class))
62+
)
63+
}
5664
Self::MissingEquivalentClassChar => {
5765
write!(
5866
f,
@@ -499,31 +507,32 @@ impl Sequence {
499507
}
500508

501509
fn parse_class(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
502-
delimited(
503-
tag("[:"),
504-
alt((
505-
map(
506-
alt((
507-
value(Self::Class(Class::Alnum), tag("alnum")),
508-
value(Self::Class(Class::Alpha), tag("alpha")),
509-
value(Self::Class(Class::Blank), tag("blank")),
510-
value(Self::Class(Class::Control), tag("cntrl")),
511-
value(Self::Class(Class::Digit), tag("digit")),
512-
value(Self::Class(Class::Graph), tag("graph")),
513-
value(Self::Class(Class::Lower), tag("lower")),
514-
value(Self::Class(Class::Print), tag("print")),
515-
value(Self::Class(Class::Punct), tag("punct")),
516-
value(Self::Class(Class::Space), tag("space")),
517-
value(Self::Class(Class::Upper), tag("upper")),
518-
value(Self::Class(Class::Xdigit), tag("xdigit")),
519-
)),
520-
Ok,
521-
),
522-
value(Err(BadSequence::MissingCharClassName), tag("")),
523-
)),
524-
tag(":]"),
525-
)
526-
.parse(input)
510+
preceded(tag("[:"), terminated(take_until(":]"), tag(":]")))
511+
.parse(input)
512+
.map(|(l, class_name)| {
513+
(
514+
l,
515+
match class_name {
516+
b"" => Err(BadSequence::MissingCharClassName),
517+
b"alnum" => Ok(Self::Class(Class::Alnum)),
518+
b"alpha" => Ok(Self::Class(Class::Alpha)),
519+
b"blank" => Ok(Self::Class(Class::Blank)),
520+
b"cntrl" => Ok(Self::Class(Class::Control)),
521+
b"digit" => Ok(Self::Class(Class::Digit)),
522+
b"graph" => Ok(Self::Class(Class::Graph)),
523+
b"lower" => Ok(Self::Class(Class::Lower)),
524+
b"print" => Ok(Self::Class(Class::Print)),
525+
b"punct" => Ok(Self::Class(Class::Punct)),
526+
b"space" => Ok(Self::Class(Class::Space)),
527+
b"upper" => Ok(Self::Class(Class::Upper)),
528+
b"xdigit" => Ok(Self::Class(Class::Xdigit)),
529+
_ => Err(BadSequence::InvalidCharClass(format!(
530+
"[:{}:]",
531+
String::from_utf8_lossy(class_name)
532+
))),
533+
},
534+
)
535+
})
527536
}
528537

529538
fn parse_char_equal(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {

src/uucore/src/lib/features/i18n/collator.rs

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,17 @@ pub fn init_locale_collation() -> bool {
7474
try_init_collator(opts)
7575
}
7676

77+
/// Compute the ICU collation sort key for the given input bytes and append it to `buf`.
78+
/// This allows pre-computing sort keys once per line, then comparing them with simple
79+
/// byte comparison during sorting (much faster than calling `compare_utf8` per comparison).
80+
pub fn compute_sort_key_utf8(input: &[u8], buf: &mut Vec<u8>) {
81+
let c = COLLATOR
82+
.get()
83+
.expect("compute_sort_key_utf8 called before collator initialization");
84+
c.write_sort_key_utf8_to(input, buf)
85+
.expect("ICU write_sort_key_utf8_to failed");
86+
}
87+
7788
/// Compare both strings with regard to the current locale.
7889
pub fn locale_cmp(left: &[u8], right: &[u8]) -> Ordering {
7990
// If the detected locale is 'C', just do byte-wise comparison

tests/by-util/test_cut.rs

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -229,6 +229,21 @@ fn test_zero_terminated_only_delimited() {
229229
.stdout_only("82\n7\0");
230230
}
231231

232+
#[test]
233+
fn test_suppresses_unterminated_segment() {
234+
new_ucmd!()
235+
.args(&["-z", "-d", "", "-s", "-f", "1"])
236+
.pipe_in("unterminated")
237+
.succeeds()
238+
.stdout_only_bytes("");
239+
240+
new_ucmd!()
241+
.args(&["-z", "-d", "", "-s", "-f", "1"])
242+
.pipe_in("terminated\0unterminated")
243+
.succeeds()
244+
.stdout_only_bytes("terminated\0");
245+
}
246+
232247
#[test]
233248
fn test_is_a_directory() {
234249
let (at, mut ucmd) = at_and_ucmd!();

0 commit comments

Comments
 (0)