Skip to content

Commit 5865c10

Browse files
committed
paste: support multi-byte delimiters and GNU escape sequences
1 parent ba1afb0 commit 5865c10

6 files changed

Lines changed: 245 additions & 62 deletions

File tree

src/uu/paste/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ path = "src/paste.rs"
1919

2020
[dependencies]
2121
clap = { workspace = true }
22-
uucore = { workspace = true }
22+
uucore = { workspace = true, features = ["i18n-charmap"] }
2323
fluent = { workspace = true }
2424

2525
[[bin]]

src/uu/paste/src/paste.rs

Lines changed: 40 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@ use std::ffi::OsString;
99
use std::fs::File;
1010
use std::io::{BufRead, BufReader, Stdin, Write, stdin, stdout};
1111
use std::iter::Cycle;
12+
use std::os::unix::ffi::OsStringExt;
1213
use std::path::Path;
1314
use std::rc::Rc;
1415
use std::slice::Iter;
1516
use uucore::error::{UResult, USimpleError};
1617
use uucore::format_usage;
18+
use uucore::i18n::charmap::mb_char_len;
1719
use uucore::line_ending::LineEnding;
1820
use uucore::translate;
1921

@@ -29,7 +31,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
2931
let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;
3032

3133
let serial = matches.get_flag(options::SERIAL);
32-
let delimiters = matches.get_one::<String>(options::DELIMITER).unwrap();
34+
let delimiters = matches.get_one::<OsString>(options::DELIMITER).unwrap();
3335
let files = matches
3436
.get_many::<OsString>(options::FILE)
3537
.unwrap()
@@ -61,7 +63,8 @@ pub fn uu_app() -> Command {
6163
.help(translate!("paste-help-delimiter"))
6264
.value_name("LIST")
6365
.default_value("\t")
64-
.hide_default_value(true),
66+
.hide_default_value(true)
67+
.value_parser(clap::value_parser!(OsString)),
6568
)
6669
.arg(
6770
Arg::new(options::FILE)
@@ -84,7 +87,7 @@ pub fn uu_app() -> Command {
8487
fn paste(
8588
filenames: Vec<OsString>,
8689
serial: bool,
87-
delimiters: &str,
90+
delimiters: &OsString,
8891
line_ending: LineEnding,
8992
) -> UResult<()> {
9093
let unescaped_and_encoded_delimiters = parse_delimiters(delimiters)?;
@@ -185,65 +188,42 @@ fn paste(
185188
Ok(())
186189
}
187190

188-
fn parse_delimiters(delimiters: &str) -> UResult<Box<[Box<[u8]>]>> {
189-
/// A single backslash char
190-
const BACKSLASH: char = '\\';
191-
192-
fn add_one_byte_single_char_delimiter(vec: &mut Vec<Box<[u8]>>, byte: u8) {
193-
vec.push(Box::new([byte]));
194-
}
195-
196-
// a buffer of length four is large enough to encode any char
197-
let mut buffer = [0; 4];
198-
199-
let mut add_single_char_delimiter = |vec: &mut Vec<Box<[u8]>>, ch: char| {
200-
let delimiter_encoded = ch.encode_utf8(&mut buffer);
201-
202-
vec.push(Box::<[u8]>::from(delimiter_encoded.as_bytes()));
203-
};
204-
205-
let mut vec = Vec::<Box<[u8]>>::with_capacity(delimiters.len());
206-
207-
let mut chars = delimiters.chars();
208-
209-
// Unescape all special characters
210-
while let Some(char) = chars.next() {
211-
match char {
212-
BACKSLASH => match chars.next() {
213-
// "Empty string (not a null character)"
214-
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
215-
Some('0') => {
216-
vec.push(Box::<[u8; 0]>::new([]));
217-
}
218-
// "\\" to "\" (U+005C)
219-
Some(BACKSLASH) => {
220-
add_one_byte_single_char_delimiter(&mut vec, b'\\');
221-
}
222-
// "\n" to U+000A
223-
Some('n') => {
224-
add_one_byte_single_char_delimiter(&mut vec, b'\n');
225-
}
226-
// "\t" to U+0009
227-
Some('t') => {
228-
add_one_byte_single_char_delimiter(&mut vec, b'\t');
229-
}
230-
Some(other_char) => {
231-
// "If any other characters follow the <backslash>, the results are unspecified."
232-
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
233-
// However, other implementations remove the backslash
234-
// See "test_posix_unspecified_delimiter"
235-
add_single_char_delimiter(&mut vec, other_char);
236-
}
237-
None => {
238-
return Err(USimpleError::new(
239-
1,
240-
translate!("paste-error-delimiter-unescaped-backslash", "delimiters" => delimiters),
241-
));
191+
fn parse_delimiters(delimiters: &OsString) -> UResult<Box<[Box<[u8]>]>> {
192+
let bytes = delimiters.clone().into_vec();
193+
let mut vec = Vec::<Box<[u8]>>::with_capacity(bytes.len());
194+
let mut i = 0;
195+
196+
while i < bytes.len() {
197+
if bytes[i] == b'\\' {
198+
i += 1;
199+
if i >= bytes.len() {
200+
return Err(USimpleError::new(
201+
1,
202+
translate!("paste-error-delimiter-unescaped-backslash", "delimiters" => delimiters.to_string_lossy()),
203+
));
204+
}
205+
match bytes[i] {
206+
b'0' => vec.push(Box::new([])),
207+
b'\\' => vec.push(Box::new([b'\\'])),
208+
b'n' => vec.push(Box::new([b'\n'])),
209+
b't' => vec.push(Box::new([b'\t'])),
210+
b'b' => vec.push(Box::new([b'\x08'])),
211+
b'f' => vec.push(Box::new([b'\x0C'])),
212+
b'r' => vec.push(Box::new([b'\r'])),
213+
b'v' => vec.push(Box::new([b'\x0B'])),
214+
_ => {
215+
// Unknown escape: strip backslash, use the following character(s)
216+
let len = mb_char_len(&bytes[i..]);
217+
vec.push(Box::from(&bytes[i..i + len]));
218+
i += len;
219+
continue;
242220
}
243-
},
244-
non_backslash_char => {
245-
add_single_char_delimiter(&mut vec, non_backslash_char);
246221
}
222+
i += 1;
223+
} else {
224+
let len = mb_char_len(&bytes[i..]);
225+
vec.push(Box::from(&bytes[i..i + len]));
226+
i += len;
247227
}
248228
}
249229

src/uucore/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,8 @@ format = [
150150
"quoting-style",
151151
"unit-prefix",
152152
]
153-
i18n-all = ["i18n-collator", "i18n-decimal", "i18n-datetime"]
153+
i18n-all = ["i18n-charmap", "i18n-collator", "i18n-decimal", "i18n-datetime"]
154+
i18n-charmap = ["i18n-common"]
154155
i18n-common = ["icu_locale"]
155156
i18n-collator = ["i18n-common", "icu_collator"]
156157
i18n-decimal = ["i18n-common", "icu_decimal", "icu_provider"]
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
// This file is part of the uutils coreutils package.
2+
//
3+
// For the full copyright and license information, please view the LICENSE
4+
// file that was distributed with this source code.
5+
6+
// spell-checker:ignore langinfo charmap eucjp euckr euctw CTYPE HKSCS hkscs localedata
7+
8+
//! Locale-aware multi-byte character length detection via `LC_CTYPE`.
9+
10+
use std::sync::OnceLock;
11+
12+
/// Multi-byte encoding families whose character lengths this module can detect.
///
/// The enum is a plain tag with no payload, so it derives the cheap standard
/// traits: `Debug` for diagnostics, `Copy`/`Clone` so it can be passed by value,
/// and `PartialEq`/`Eq` so it can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MbEncoding {
    /// UTF-8; also used as the fallback for unset, `C`/`POSIX`, or unrecognised locales.
    Utf8,
    /// GB18030; the `gbk` and `gb2312` charset names are mapped here as well.
    Gb18030,
    /// EUC-JP.
    EucJp,
    /// EUC-KR.
    EucKr,
    /// Big5, including the `big5-hkscs` charset names.
    Big5,
}
19+
20+
fn encoding_from_name(enc: &str) -> MbEncoding {
21+
match enc {
22+
"gb18030" | "gbk" | "gb2312" => MbEncoding::Gb18030,
23+
"euc-jp" | "eucjp" => MbEncoding::EucJp,
24+
"euc-kr" | "euckr" => MbEncoding::EucKr,
25+
"big5" | "big5-hkscs" | "big5hkscs" | "euc-tw" | "euctw" => MbEncoding::Big5,
26+
_ => MbEncoding::Utf8,
27+
}
28+
}
29+
30+
fn get_encoding() -> &'static MbEncoding {
31+
static ENCODING: OnceLock<MbEncoding> = OnceLock::new();
32+
ENCODING.get_or_init(|| {
33+
let val = ["LC_ALL", "LC_CTYPE", "LANG"]
34+
.iter()
35+
.find_map(|&k| std::env::var(k).ok());
36+
let s = match val.as_deref() {
37+
Some(s) if !s.is_empty() && s != "C" && s != "POSIX" => s,
38+
_ => return MbEncoding::Utf8,
39+
};
40+
if let Some(enc) = s.split('.').nth(1) {
41+
let enc = enc.split('@').next().unwrap_or(enc);
42+
encoding_from_name(&enc.to_ascii_lowercase())
43+
} else {
44+
// Bare locale defaults from glibc localedata/SUPPORTED
45+
match s.split('@').next().unwrap_or(s) {
46+
"zh_CN" | "zh_SG" => MbEncoding::Gb18030,
47+
"zh_TW" | "zh_HK" => MbEncoding::Big5,
48+
_ => MbEncoding::Utf8,
49+
}
50+
}
51+
})
52+
}
53+
54+
/// Byte length of the first character in `bytes` under the current locale encoding.
55+
/// Returns 1 for empty, invalid, or incomplete sequences.
56+
pub fn mb_char_len(bytes: &[u8]) -> usize {
57+
if bytes.is_empty() {
58+
return 1;
59+
}
60+
let b0 = bytes[0];
61+
if b0 <= 0x7F {
62+
return 1;
63+
}
64+
match get_encoding() {
65+
MbEncoding::Utf8 => utf8_len(bytes, b0),
66+
MbEncoding::Gb18030 => gb18030_len(bytes, b0),
67+
MbEncoding::EucJp => eucjp_len(bytes, b0),
68+
MbEncoding::EucKr => euckr_len(bytes, b0),
69+
MbEncoding::Big5 => big5_len(bytes, b0),
70+
}
71+
}
72+
73+
// All helpers below assume b0 > 0x7F (ASCII already handled by caller).
74+
75+
/// Length in bytes of the UTF-8 sequence at the start of `b`, or 1 when those
/// bytes are not one well-formed, complete UTF-8 character.
///
/// `b0` must be the lead byte `b[0]`; it selects the expected sequence length.
fn utf8_len(b: &[u8], b0: u8) -> usize {
    let n = match b0 {
        0xC2..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF4 => 4,
        // Continuation bytes, the overlong leads C0/C1, and F5..=FF never start a character.
        _ => return 1,
    };
    // Checking only the `10xxxxxx` shape of the trailing bytes would still accept
    // overlong forms (E0 80 80), surrogates (ED A0 80) and values above U+10FFFF
    // (F4 90 80 80). The standard validator rejects all of these, and `get`
    // covers the truncated case.
    match b.get(..n) {
        Some(seq) if std::str::from_utf8(seq).is_ok() => n,
        _ => 1,
    }
}
88+
89+
// GB18030 sequence shapes (lead byte must be in 81-FE):
//   4-byte: [81-FE][30-39][81-FE][30-39]
//   2-byte: [81-FE][40-7E or 80-FE]
// Anything else is counted as a single byte.
fn gb18030_len(b: &[u8], b0: u8) -> usize {
    if !matches!(b0, 0x81..=0xFE) {
        return 1;
    }
    // The lead byte was validated through `b0`, so position 0 is not re-checked.
    match b {
        [_, 0x30..=0x39, 0x81..=0xFE, 0x30..=0x39, ..] => 4,
        [_, 0x40..=0x7E | 0x80..=0xFE, ..] => 2,
        _ => 1,
    }
}
106+
107+
// EUC-JP sequence shapes:
//   3-byte: [8F][A1-FE][A1-FE]
//   2-byte: [8E][A1-DF]  or  [A1-FE][A1-FE]
// Anything else is counted as a single byte.
fn eucjp_len(b: &[u8], b0: u8) -> usize {
    // The lead byte is matched through `b0`; the slice patterns only inspect the trail bytes.
    match (b0, b) {
        (0x8F, [_, 0xA1..=0xFE, 0xA1..=0xFE, ..]) => 3,
        (0x8E, [_, 0xA1..=0xDF, ..]) => 2,
        (0xA1..=0xFE, [_, 0xA1..=0xFE, ..]) => 2,
        _ => 1,
    }
}
123+
124+
// EUC-KR has a single multi-byte shape: [A1-FE][A1-FE].
// Anything else is counted as a single byte.
fn euckr_len(b: &[u8], b0: u8) -> usize {
    let is_pair = matches!((b0, b), (0xA1..=0xFE, [_, 0xA1..=0xFE, ..]));
    if is_pair { 2 } else { 1 }
}
132+
133+
// Big5 has a single multi-byte shape: [81-FE][40-7E or A1-FE].
// Anything else is counted as a single byte.
fn big5_len(b: &[u8], b0: u8) -> usize {
    let is_pair = matches!(
        (b0, b),
        (0x81..=0xFE, [_, 0x40..=0x7E | 0xA1..=0xFE, ..])
    );
    if is_pair { 2 } else { 1 }
}

src/uucore/src/lib/features/i18n/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ use std::sync::OnceLock;
77

88
use icu_locale::{Locale, locale};
99

10+
#[cfg(feature = "i18n-charmap")]
11+
pub mod charmap;
1012
#[cfg(feature = "i18n-collator")]
1113
pub mod collator;
1214
#[cfg(feature = "i18n-datetime")]

tests/by-util/test_paste.rs

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,30 @@ const EXAMPLE_DATA: &[TestData] = &[
135135
ins: &["1 \na \n", "2\t\nb\t\n"],
136136
out: "1 |2\t\na |b\t\n",
137137
},
138+
TestData {
139+
name: "utf8-2byte-delim",
140+
args: &["-d", "\u{00A2}"],
141+
ins: &["1\n2\n", "a\nb\n"],
142+
out: "1\u{00A2}a\n2\u{00A2}b\n",
143+
},
144+
TestData {
145+
name: "utf8-3byte-delim",
146+
args: &["-d", "\u{20AC}"],
147+
ins: &["1\n2\n", "a\nb\n"],
148+
out: "1\u{20AC}a\n2\u{20AC}b\n",
149+
},
150+
TestData {
151+
name: "utf8-4byte-delim",
152+
args: &["-d", "\u{1F600}", "-s"],
153+
ins: &["1\n2\n3\n"],
154+
out: "1\u{1F600}2\u{1F600}3\n",
155+
},
156+
TestData {
157+
name: "utf8-multi-delim-cycle",
158+
args: &["-d", "\u{00A2}\u{20AC}"],
159+
ins: &["a\nb\nc\n", "1\n2\n3\n", "x\ny\nz\n"],
160+
out: "a\u{00A2}1\u{20AC}x\nb\u{00A2}2\u{20AC}y\nc\u{00A2}3\u{20AC}z\n",
161+
},
138162
];
139163

140164
#[test]
@@ -334,6 +358,24 @@ fn test_backslash_zero_delimiter() {
334358
}
335359
}
336360

361+
#[test]
fn test_gnu_escape_sequences() {
    // Each escape accepted by `-d` must come out as exactly one control byte
    // between the serially pasted lines.
    let table: [(&str, u8); 4] = [
        (r"\b", 0x08),
        (r"\f", 0x0C),
        (r"\r", 0x0D),
        (r"\v", 0x0B),
    ];
    for (escape, control) in table {
        let want = [b'1', control, b'2', control, b'3', b'\n'];
        new_ucmd!()
            .args(&["-s", "-d", escape])
            .pipe_in("1\n2\n3\n")
            .succeeds()
            .stdout_only_bytes(want);
    }
}
378+
337379
// As of 2024-10-09, only bsdutils (https://github.com/dcantrell/bsdutils, derived from FreeBSD) and toybox handle
338380
// multibyte delimiter characters in the way a user would likely expect. BusyBox and GNU Core Utilities do not.
339381
#[test]
@@ -378,6 +420,21 @@ fn test_data() {
378420
}
379421
}
380422

423+
#[test]
#[cfg(target_os = "linux")]
fn test_non_utf8_delimiter() {
    // The delimiter bytes A2 E3 are not valid UTF-8; under a GB18030 locale the
    // expected output shows them inserted together as one delimiter.
    let (fixtures, mut cmd) = at_and_ucmd!();
    for (name, content) in [("f1", "1\n2\n"), ("f2", "a\nb\n")] {
        fixtures.write(name, content);
    }
    let gb_delim = std::ffi::OsString::from_vec(vec![0xA2, 0xE3]);
    cmd.env("LC_ALL", "zh_CN.gb18030")
        .arg("-d")
        .arg(&gb_delim)
        .args(&["f1", "f2"])
        .succeeds()
        .stdout_only_bytes(b"1\xA2\xE3a\n2\xA2\xE3b\n");
}
437+
381438
#[test]
382439
#[cfg(target_os = "linux")]
383440
fn test_paste_non_utf8_paths() {

0 commit comments

Comments
 (0)