Skip to content

Commit 143fec3

Browse files
committed
paste: support multi-byte delimiters and GNU escape sequences
1 parent ba1afb0 commit 143fec3

7 files changed

Lines changed: 242 additions & 62 deletions

File tree

src/uu/paste/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ path = "src/paste.rs"
1919

2020
[dependencies]
2121
clap = { workspace = true }
22-
uucore = { workspace = true }
22+
uucore = { workspace = true, features = ["i18n-charmap"] }
2323
fluent = { workspace = true }
2424

2525
[[bin]]

src/uu/paste/src/paste.rs

Lines changed: 39 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use std::rc::Rc;
1414
use std::slice::Iter;
1515
use uucore::error::{UResult, USimpleError};
1616
use uucore::format_usage;
17+
use uucore::i18n::charmap::mb_char_len;
1718
use uucore::line_ending::LineEnding;
1819
use uucore::translate;
1920

@@ -29,7 +30,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
2930
let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;
3031

3132
let serial = matches.get_flag(options::SERIAL);
32-
let delimiters = matches.get_one::<String>(options::DELIMITER).unwrap();
33+
let delimiters = matches.get_one::<OsString>(options::DELIMITER).unwrap();
3334
let files = matches
3435
.get_many::<OsString>(options::FILE)
3536
.unwrap()
@@ -61,7 +62,8 @@ pub fn uu_app() -> Command {
6162
.help(translate!("paste-help-delimiter"))
6263
.value_name("LIST")
6364
.default_value("\t")
64-
.hide_default_value(true),
65+
.hide_default_value(true)
66+
.value_parser(clap::value_parser!(OsString)),
6567
)
6668
.arg(
6769
Arg::new(options::FILE)
@@ -84,7 +86,7 @@ pub fn uu_app() -> Command {
8486
fn paste(
8587
filenames: Vec<OsString>,
8688
serial: bool,
87-
delimiters: &str,
89+
delimiters: &OsString,
8890
line_ending: LineEnding,
8991
) -> UResult<()> {
9092
let unescaped_and_encoded_delimiters = parse_delimiters(delimiters)?;
@@ -185,65 +187,42 @@ fn paste(
185187
Ok(())
186188
}
187189

188-
fn parse_delimiters(delimiters: &str) -> UResult<Box<[Box<[u8]>]>> {
189-
/// A single backslash char
190-
const BACKSLASH: char = '\\';
191-
192-
fn add_one_byte_single_char_delimiter(vec: &mut Vec<Box<[u8]>>, byte: u8) {
193-
vec.push(Box::new([byte]));
194-
}
195-
196-
// a buffer of length four is large enough to encode any char
197-
let mut buffer = [0; 4];
198-
199-
let mut add_single_char_delimiter = |vec: &mut Vec<Box<[u8]>>, ch: char| {
200-
let delimiter_encoded = ch.encode_utf8(&mut buffer);
201-
202-
vec.push(Box::<[u8]>::from(delimiter_encoded.as_bytes()));
203-
};
204-
205-
let mut vec = Vec::<Box<[u8]>>::with_capacity(delimiters.len());
206-
207-
let mut chars = delimiters.chars();
208-
209-
// Unescape all special characters
210-
while let Some(char) = chars.next() {
211-
match char {
212-
BACKSLASH => match chars.next() {
213-
// "Empty string (not a null character)"
214-
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
215-
Some('0') => {
216-
vec.push(Box::<[u8; 0]>::new([]));
217-
}
218-
// "\\" to "\" (U+005C)
219-
Some(BACKSLASH) => {
220-
add_one_byte_single_char_delimiter(&mut vec, b'\\');
221-
}
222-
// "\n" to U+000A
223-
Some('n') => {
224-
add_one_byte_single_char_delimiter(&mut vec, b'\n');
225-
}
226-
// "\t" to U+0009
227-
Some('t') => {
228-
add_one_byte_single_char_delimiter(&mut vec, b'\t');
229-
}
230-
Some(other_char) => {
231-
// "If any other characters follow the <backslash>, the results are unspecified."
232-
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
233-
// However, other implementations remove the backslash
234-
// See "test_posix_unspecified_delimiter"
235-
add_single_char_delimiter(&mut vec, other_char);
236-
}
237-
None => {
238-
return Err(USimpleError::new(
239-
1,
240-
translate!("paste-error-delimiter-unescaped-backslash", "delimiters" => delimiters),
241-
));
190+
fn parse_delimiters(delimiters: &OsString) -> UResult<Box<[Box<[u8]>]>> {
191+
let bytes = uucore::os_string_to_vec(delimiters.clone())?;
192+
let mut vec = Vec::<Box<[u8]>>::with_capacity(bytes.len());
193+
let mut i = 0;
194+
195+
while i < bytes.len() {
196+
if bytes[i] == b'\\' {
197+
i += 1;
198+
if i >= bytes.len() {
199+
return Err(USimpleError::new(
200+
1,
201+
translate!("paste-error-delimiter-unescaped-backslash", "delimiters" => delimiters.to_string_lossy()),
202+
));
203+
}
204+
match bytes[i] {
205+
b'0' => vec.push(Box::new([])),
206+
b'\\' => vec.push(Box::new([b'\\'])),
207+
b'n' => vec.push(Box::new([b'\n'])),
208+
b't' => vec.push(Box::new([b'\t'])),
209+
b'b' => vec.push(Box::new([b'\x08'])),
210+
b'f' => vec.push(Box::new([b'\x0C'])),
211+
b'r' => vec.push(Box::new([b'\r'])),
212+
b'v' => vec.push(Box::new([b'\x0B'])),
213+
_ => {
214+
// Unknown escape: strip backslash, use the following character(s)
215+
let len = mb_char_len(&bytes[i..]);
216+
vec.push(Box::from(&bytes[i..i + len]));
217+
i += len;
218+
continue;
242219
}
243-
},
244-
non_backslash_char => {
245-
add_single_char_delimiter(&mut vec, non_backslash_char);
246220
}
221+
i += 1;
222+
} else {
223+
let len = mb_char_len(&bytes[i..]);
224+
vec.push(Box::from(&bytes[i..i + len]));
225+
i += len;
247226
}
248227
}
249228

src/uucore/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,8 @@ format = [
150150
"quoting-style",
151151
"unit-prefix",
152152
]
153-
i18n-all = ["i18n-collator", "i18n-decimal", "i18n-datetime"]
153+
i18n-all = ["i18n-charmap", "i18n-collator", "i18n-decimal", "i18n-datetime"]
154+
i18n-charmap = ["i18n-common"]
154155
i18n-common = ["icu_locale"]
155156
i18n-collator = ["i18n-common", "icu_collator"]
156157
i18n-decimal = ["i18n-common", "icu_decimal", "icu_provider"]
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
// This file is part of the uutils coreutils package.
2+
//
3+
// For the full copyright and license information, please view the LICENSE
4+
// file that was distributed with this source code.
5+
6+
// spell-checker:ignore langinfo charmap eucjp euckr euctw CTYPE HKSCS hkscs localedata
7+
8+
//! Locale-aware multi-byte character length detection via `LC_CTYPE`.
9+
10+
use std::sync::OnceLock;
11+
12+
/// Multi-byte character encodings whose character boundaries this module can detect.
///
/// The standard value traits are derived so that a detected encoding can be
/// logged, compared and passed around by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MbEncoding {
    /// UTF-8.
    Utf8,
    /// GB18030 (two- and four-byte sequences).
    Gb18030,
    /// EUC-JP (two- and three-byte sequences).
    EucJp,
    /// EUC-KR (two-byte sequences).
    EucKr,
    /// Big5 (two-byte sequences).
    Big5,
}
19+
20+
fn encoding_from_name(enc: &str) -> MbEncoding {
21+
match enc {
22+
"gb18030" | "gbk" | "gb2312" => MbEncoding::Gb18030,
23+
"euc-jp" | "eucjp" => MbEncoding::EucJp,
24+
"euc-kr" | "euckr" => MbEncoding::EucKr,
25+
"big5" | "big5-hkscs" | "big5hkscs" | "euc-tw" | "euctw" => MbEncoding::Big5,
26+
_ => MbEncoding::Utf8,
27+
}
28+
}
29+
30+
fn get_encoding() -> &'static MbEncoding {
31+
static ENCODING: OnceLock<MbEncoding> = OnceLock::new();
32+
ENCODING.get_or_init(|| {
33+
let val = ["LC_ALL", "LC_CTYPE", "LANG"]
34+
.iter()
35+
.find_map(|&k| std::env::var(k).ok().filter(|v| !v.is_empty()));
36+
let s = match val.as_deref() {
37+
Some(s) if s != "C" && s != "POSIX" => s,
38+
_ => return MbEncoding::Utf8,
39+
};
40+
if let Some(enc) = s.split('.').nth(1) {
41+
let enc = enc.split('@').next().unwrap_or(enc);
42+
encoding_from_name(&enc.to_ascii_lowercase())
43+
} else {
44+
// Bare locale defaults from glibc localedata/SUPPORTED
45+
match s.split('@').next().unwrap_or(s) {
46+
"zh_CN" | "zh_SG" => MbEncoding::Gb18030,
47+
"zh_TW" | "zh_HK" => MbEncoding::Big5,
48+
_ => MbEncoding::Utf8,
49+
}
50+
}
51+
})
52+
}
53+
54+
/// Byte length of the first character in `bytes` under the current locale encoding.
55+
/// Returns 1 for empty, invalid, or incomplete sequences.
56+
pub fn mb_char_len(bytes: &[u8]) -> usize {
57+
if bytes.is_empty() {
58+
return 1;
59+
}
60+
let b0 = bytes[0];
61+
if b0 <= 0x7F {
62+
return 1;
63+
}
64+
match get_encoding() {
65+
MbEncoding::Utf8 => utf8_len(bytes, b0),
66+
MbEncoding::Gb18030 => gb18030_len(bytes, b0),
67+
MbEncoding::EucJp => eucjp_len(bytes, b0),
68+
MbEncoding::EucKr => euckr_len(bytes, b0),
69+
MbEncoding::Big5 => big5_len(bytes, b0),
70+
}
71+
}
72+
73+
// All helpers below assume b0 > 0x7F (ASCII already handled by caller).
74+
75+
/// Length in bytes of the UTF-8 character at the start of `b`.
///
/// `b0` must be the first byte of `b`. Returns 1 when the lead byte is not a
/// valid multi-byte lead, when the sequence is truncated, or when it is not
/// well-formed UTF-8. Overlong forms, surrogate code points and values above
/// U+10FFFF are all rejected, so malformed input is consumed one byte at a time.
fn utf8_len(b: &[u8], b0: u8) -> usize {
    let n = match b0 {
        0xC2..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF4 => 4,
        _ => return 1,
    };
    // Checking only for `10xxxxxx` continuation bytes would accept sequences
    // such as E0 80 80 or ED A0 80; let the standard library do the full check.
    match b.get(..n) {
        Some(seq) if std::str::from_utf8(seq).is_ok() => n,
        _ => 1,
    }
}
88+
89+
/// Length of the GB18030 character at the start of `b`, whose first byte is `b0`.
///
/// Recognised forms:
/// - 4-byte: `[81-FE][30-39][81-FE][30-39]`
/// - 2-byte: `[81-FE][40-7E,80-FE]`
///
/// Anything else counts as a single byte.
fn gb18030_len(b: &[u8], b0: u8) -> usize {
    if !matches!(b0, 0x81..=0xFE) {
        return 1;
    }
    match b {
        [_, 0x30..=0x39, 0x81..=0xFE, 0x30..=0x39, ..] => 4,
        [_, 0x40..=0x7E | 0x80..=0xFE, ..] => 2,
        _ => 1,
    }
}
106+
107+
/// Length of the EUC-JP character at the start of `b`, whose first byte is `b0`.
///
/// Recognised forms:
/// - 3-byte: `[8F][A1-FE][A1-FE]`
/// - 2-byte: `[8E][A1-DF]` or `[A1-FE][A1-FE]`
///
/// Anything else counts as a single byte.
fn eucjp_len(b: &[u8], b0: u8) -> usize {
    match (b0, b) {
        (0x8F, [_, 0xA1..=0xFE, 0xA1..=0xFE, ..]) => 3,
        (0x8E, [_, 0xA1..=0xDF, ..]) => 2,
        (0xA1..=0xFE, [_, 0xA1..=0xFE, ..]) => 2,
        _ => 1,
    }
}
123+
124+
/// Length of the EUC-KR character at the start of `b`, whose first byte is `b0`.
///
/// The only multi-byte form is `[A1-FE][A1-FE]`; anything else counts as a
/// single byte.
fn euckr_len(b: &[u8], b0: u8) -> usize {
    match (b0, b.get(1).copied()) {
        (0xA1..=0xFE, Some(0xA1..=0xFE)) => 2,
        _ => 1,
    }
}
132+
133+
/// Length of the Big5 character at the start of `b`, whose first byte is `b0`.
///
/// The only multi-byte form is `[81-FE][40-7E,A1-FE]`; anything else counts as
/// a single byte.
fn big5_len(b: &[u8], b0: u8) -> usize {
    if !matches!(b0, 0x81..=0xFE) {
        return 1;
    }
    match b.get(1).copied() {
        Some(0x40..=0x7E | 0xA1..=0xFE) => 2,
        _ => 1,
    }
}

src/uucore/src/lib/features/i18n/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ use std::sync::OnceLock;
77

88
use icu_locale::{Locale, locale};
99

10+
#[cfg(feature = "i18n-charmap")]
11+
pub mod charmap;
1012
#[cfg(feature = "i18n-collator")]
1113
pub mod collator;
1214
#[cfg(feature = "i18n-datetime")]

tests/by-util/test_paste.rs

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,30 @@ const EXAMPLE_DATA: &[TestData] = &[
135135
ins: &["1 \na \n", "2\t\nb\t\n"],
136136
out: "1 |2\t\na |b\t\n",
137137
},
138+
TestData {
139+
name: "utf8-2byte-delim",
140+
args: &["-d", "\u{00A2}"],
141+
ins: &["1\n2\n", "a\nb\n"],
142+
out: "1\u{00A2}a\n2\u{00A2}b\n",
143+
},
144+
TestData {
145+
name: "utf8-3byte-delim",
146+
args: &["-d", "\u{20AC}"],
147+
ins: &["1\n2\n", "a\nb\n"],
148+
out: "1\u{20AC}a\n2\u{20AC}b\n",
149+
},
150+
TestData {
151+
name: "utf8-4byte-delim",
152+
args: &["-d", "\u{1F600}", "-s"],
153+
ins: &["1\n2\n3\n"],
154+
out: "1\u{1F600}2\u{1F600}3\n",
155+
},
156+
TestData {
157+
name: "utf8-multi-delim-cycle",
158+
args: &["-d", "\u{00A2}\u{20AC}"],
159+
ins: &["a\nb\nc\n", "1\n2\n3\n", "x\ny\nz\n"],
160+
out: "a\u{00A2}1\u{20AC}x\nb\u{00A2}2\u{20AC}y\nc\u{00A2}3\u{20AC}z\n",
161+
},
138162
];
139163

140164
#[test]
@@ -334,6 +358,19 @@ fn test_backslash_zero_delimiter() {
334358
}
335359
}
336360

361+
#[test]
fn test_gnu_escape_sequences() {
    // Each escape accepted by `-d` must be emitted as the single control byte it names.
    for (escape, delimiter) in [
        (r"\b", 0x08_u8),
        (r"\f", 0x0C),
        (r"\r", 0x0D),
        (r"\v", 0x0B),
    ] {
        // Serial mode joins the three input lines with the delimiter.
        let expected = [b'1', delimiter, b'2', delimiter, b'3', b'\n'];
        new_ucmd!()
            .args(&["-s", "-d", escape])
            .pipe_in("1\n2\n3\n")
            .succeeds()
            .stdout_only_bytes(expected);
    }
}
373+
337374
// As of 2024-10-09, only bsdutils (https://github.com/dcantrell/bsdutils, derived from FreeBSD) and toybox handle
338375
// multibyte delimiter characters in the way a user would likely expect. BusyBox and GNU Core Utilities do not.
339376
#[test]
@@ -378,6 +415,21 @@ fn test_data() {
378415
}
379416
}
380417

418+
#[test]
#[cfg(target_os = "linux")]
fn test_non_utf8_delimiter() {
    let (at, mut ucmd) = at_and_ucmd!();
    for (name, contents) in [("f1", "1\n2\n"), ("f2", "a\nb\n")] {
        at.write(name, contents);
    }

    // The bytes A2 E3 are not valid UTF-8; under the GB18030 locale they must
    // be kept together as one delimiter.
    let delimiter = std::ffi::OsString::from_vec(vec![0xA2, 0xE3]);

    ucmd.env("LC_ALL", "zh_CN.gb18030")
        .arg("-d")
        .arg(&delimiter)
        .args(&["f1", "f2"])
        .succeeds()
        .stdout_only_bytes(b"1\xA2\xE3a\n2\xA2\xE3b\n");
}
432+
381433
#[test]
382434
#[cfg(target_os = "linux")]
383435
fn test_paste_non_utf8_paths() {

util/build-gnu.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,9 @@ fi
162162
grep -rl 'path_prepend_' tests/* | xargs -r "${SED}" -i 's| path_prepend_ ./src||'
163163
# path_prepend_ sets $abs_path_dir_: set it manually instead.
164164
grep -rl '\$abs_path_dir_' tests/*/*.sh | xargs -r "${SED}" -i "s|\$abs_path_dir_|${UU_BUILD_DIR//\//\\/}|g"
165+
# Some tests use $abs_top_builddir/src for shebangs: point them to the uutils build dir.
166+
grep -rl '\$abs_top_builddir/src' tests/*/*.sh tests/*/*.pl | xargs -r "${SED}" -i "s|\$abs_top_builddir/src|${UU_BUILD_DIR//\//\\/}|g"
167+
grep -rl '\$ENV{abs_top_builddir}/src' tests/*/*.pl | xargs -r "${SED}" -i "s|\$ENV{abs_top_builddir}/src|${UU_BUILD_DIR//\//\\/}|g"
165168

166169
# We can't build runcon and chcon without libselinux. But GNU no longer builds dummies of them. So consider they are SELinux specific.
167170
sed -i 's/^print_ver_.*/require_selinux_/' tests/runcon/runcon-compute.sh

0 commit comments

Comments
 (0)