paste: support multi-byte delimiters and GNU escape sequences

ChrisDryden · ChrisDryden · commit 143fec30c96e · 2026-02-09T21:22:41.000Z
diff --git a/src/uu/paste/Cargo.toml b/src/uu/paste/Cargo.toml
@@ -19,7 +19,7 @@ path = "src/paste.rs"
 
 [dependencies]
 clap = { workspace = true }
-uucore = { workspace = true }
+uucore = { workspace = true, features = ["i18n-charmap"] }
 fluent = { workspace = true }
 
 [[bin]]
diff --git a/src/uu/paste/src/paste.rs b/src/uu/paste/src/paste.rs
@@ -14,6 +14,7 @@ use std::rc::Rc;
 use std::slice::Iter;
 use uucore::error::{UResult, USimpleError};
 use uucore::format_usage;
+use uucore::i18n::charmap::mb_char_len;
 use uucore::line_ending::LineEnding;
 use uucore::translate;
 
@@ -29,7 +30,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
     let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;
 
     let serial = matches.get_flag(options::SERIAL);
-    let delimiters = matches.get_one::<String>(options::DELIMITER).unwrap();
+    let delimiters = matches.get_one::<OsString>(options::DELIMITER).unwrap();
     let files = matches
         .get_many::<OsString>(options::FILE)
         .unwrap()
@@ -61,7 +62,8 @@ pub fn uu_app() -> Command {
                 .help(translate!("paste-help-delimiter"))
                 .value_name("LIST")
                 .default_value("\t")
-                .hide_default_value(true),
+                .hide_default_value(true)
+                .value_parser(clap::value_parser!(OsString)),
         )
         .arg(
             Arg::new(options::FILE)
@@ -84,7 +86,7 @@ pub fn uu_app() -> Command {
 fn paste(
     filenames: Vec<OsString>,
     serial: bool,
-    delimiters: &str,
+    delimiters: &OsString,
     line_ending: LineEnding,
 ) -> UResult<()> {
     let unescaped_and_encoded_delimiters = parse_delimiters(delimiters)?;
@@ -185,65 +187,42 @@ fn paste(
     Ok(())
 }
 
-fn parse_delimiters(delimiters: &str) -> UResult<Box<[Box<[u8]>]>> {
-    /// A single backslash char
-    const BACKSLASH: char = '\\';
-
-    fn add_one_byte_single_char_delimiter(vec: &mut Vec<Box<[u8]>>, byte: u8) {
-        vec.push(Box::new([byte]));
-    }
-
-    // a buffer of length four is large enough to encode any char
-    let mut buffer = [0; 4];
-
-    let mut add_single_char_delimiter = |vec: &mut Vec<Box<[u8]>>, ch: char| {
-        let delimiter_encoded = ch.encode_utf8(&mut buffer);
-
-        vec.push(Box::<[u8]>::from(delimiter_encoded.as_bytes()));
-    };
-
-    let mut vec = Vec::<Box<[u8]>>::with_capacity(delimiters.len());
-
-    let mut chars = delimiters.chars();
-
-    // Unescape all special characters
-    while let Some(char) = chars.next() {
-        match char {
-            BACKSLASH => match chars.next() {
-                // "Empty string (not a null character)"
-                // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
-                Some('0') => {
-                    vec.push(Box::<[u8; 0]>::new([]));
-                }
-                // "\\" to "\" (U+005C)
-                Some(BACKSLASH) => {
-                    add_one_byte_single_char_delimiter(&mut vec, b'\\');
-                }
-                // "\n" to U+000A
-                Some('n') => {
-                    add_one_byte_single_char_delimiter(&mut vec, b'\n');
-                }
-                // "\t" to U+0009
-                Some('t') => {
-                    add_one_byte_single_char_delimiter(&mut vec, b'\t');
-                }
-                Some(other_char) => {
-                    // "If any other characters follow the <backslash>, the results are unspecified."
-                    // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
-                    // However, other implementations remove the backslash
-                    // See "test_posix_unspecified_delimiter"
-                    add_single_char_delimiter(&mut vec, other_char);
-                }
-                None => {
-                    return Err(USimpleError::new(
-                        1,
-                        translate!("paste-error-delimiter-unescaped-backslash", "delimiters" => delimiters),
-                    ));
+fn parse_delimiters(delimiters: &OsString) -> UResult<Box<[Box<[u8]>]>> {
+    let bytes = uucore::os_string_to_vec(delimiters.clone())?;
+    let mut vec = Vec::<Box<[u8]>>::with_capacity(bytes.len());
+    let mut i = 0;
+
+    while i < bytes.len() {
+        if bytes[i] == b'\\' {
+            i += 1;
+            if i >= bytes.len() {
+                return Err(USimpleError::new(
+                    1,
+                    translate!("paste-error-delimiter-unescaped-backslash", "delimiters" => delimiters.to_string_lossy()),
+                ));
+            }
+            match bytes[i] {
+                b'0' => vec.push(Box::new([])),
+                b'\\' => vec.push(Box::new([b'\\'])),
+                b'n' => vec.push(Box::new([b'\n'])),
+                b't' => vec.push(Box::new([b'\t'])),
+                b'b' => vec.push(Box::new([b'\x08'])),
+                b'f' => vec.push(Box::new([b'\x0C'])),
+                b'r' => vec.push(Box::new([b'\r'])),
+                b'v' => vec.push(Box::new([b'\x0B'])),
+                _ => {
+                    // Unknown escape: strip backslash, use the following character(s)
+                    let len = mb_char_len(&bytes[i..]);
+                    vec.push(Box::from(&bytes[i..i + len]));
+                    i += len;
+                    continue;
                 }
-            },
-            non_backslash_char => {
-                add_single_char_delimiter(&mut vec, non_backslash_char);
             }
+            i += 1;
+        } else {
+            let len = mb_char_len(&bytes[i..]);
+            vec.push(Box::from(&bytes[i..i + len]));
+            i += len;
         }
     }
 
diff --git a/src/uucore/Cargo.toml b/src/uucore/Cargo.toml
@@ -150,7 +150,8 @@ format = [
   "quoting-style",
   "unit-prefix",
 ]
-i18n-all = ["i18n-collator", "i18n-decimal", "i18n-datetime"]
+i18n-all = ["i18n-charmap", "i18n-collator", "i18n-decimal", "i18n-datetime"]
+i18n-charmap = ["i18n-common"]
 i18n-common = ["icu_locale"]
 i18n-collator = ["i18n-common", "icu_collator"]
 i18n-decimal = ["i18n-common", "icu_decimal", "icu_provider"]
diff --git a/src/uucore/src/lib/features/i18n/charmap.rs b/src/uucore/src/lib/features/i18n/charmap.rs
@@ -0,0 +1,143 @@
+// This file is part of the uutils coreutils package.
+//
+// For the full copyright and license information, please view the LICENSE
+// file that was distributed with this source code.
+
+// spell-checker:ignore langinfo charmap eucjp euckr euctw CTYPE HKSCS hkscs localedata
+
+//! Locale-aware multi-byte character length detection via `LC_CTYPE`.
+
+use std::sync::OnceLock;
+
+enum MbEncoding {
+    Utf8, // default: C/POSIX, no locale variable set, or nothing more specific detected
+    Gb18030, // also chosen for the "gbk" and "gb2312" codeset names
+    EucJp,
+    EucKr,
+    Big5, // also chosen for the "big5-hkscs" and "euc-tw" codeset names
+}
+
+fn encoding_from_name(enc: &str) -> MbEncoding {
+    match enc {
+        "gb18030" | "gbk" | "gb2312" => MbEncoding::Gb18030, // exact match; caller lowercases
+        "euc-jp" | "eucjp" => MbEncoding::EucJp,
+        "euc-kr" | "euckr" => MbEncoding::EucKr,
+        "big5" | "big5-hkscs" | "big5hkscs" | "euc-tw" | "euctw" => MbEncoding::Big5,
+        _ => MbEncoding::Utf8, // any other codeset name is scanned as UTF-8
+    }
+}
+
+fn get_encoding() -> &'static MbEncoding {
+    static CACHED: OnceLock<MbEncoding> = OnceLock::new(); // filled on first call only
+    CACHED.get_or_init(|| {
+        let from_env = ["LC_ALL", "LC_CTYPE", "LANG"]
+            .iter()
+            .find_map(|&name| std::env::var(name).ok().filter(|value| !value.is_empty()));
+        let Some(locale) = from_env.as_deref().filter(|&l| l != "C" && l != "POSIX") else {
+            return MbEncoding::Utf8;
+        };
+        match locale.split('.').nth(1) {
+            Some(codeset) => {
+                let codeset = codeset.split('@').next().unwrap_or(codeset);
+                encoding_from_name(&codeset.to_ascii_lowercase())
+            }
+            // Bare locale defaults from glibc localedata/SUPPORTED
+            None => match locale.split('@').next().unwrap_or(locale) {
+                "zh_CN" | "zh_SG" => MbEncoding::Gb18030,
+                "zh_TW" | "zh_HK" => MbEncoding::Big5,
+                _ => MbEncoding::Utf8,
+            },
+        }
+    })
+}
+
+/// Returns how many bytes the leading character of `bytes` occupies in the locale's encoding.
+/// Empty input, ASCII bytes, and invalid or truncated sequences all yield 1.
+pub fn mb_char_len(bytes: &[u8]) -> usize {
+    // ASCII is answered before the locale is looked at; only other lead bytes need a scanner.
+    let lead = match bytes.first() {
+        Some(&byte) if !byte.is_ascii() => byte,
+        _ => return 1,
+    };
+    // Every scanner receives the full slice plus its already-extracted lead byte.
+    let scan: fn(&[u8], u8) -> usize = match get_encoding() {
+        MbEncoding::Utf8 => utf8_len,
+        MbEncoding::Gb18030 => gb18030_len,
+        MbEncoding::EucJp => eucjp_len,
+        MbEncoding::EucKr => euckr_len,
+        MbEncoding::Big5 => big5_len,
+    };
+    scan(bytes, lead)
+}
+
+// All helpers below assume b0 > 0x7F (ASCII already handled by caller).
+
+fn utf8_len(b: &[u8], b0: u8) -> usize {
+    let n = match b0 {
+        0xC2..=0xDF => 2,
+        0xE0..=0xEF => 3,
+        0xF0..=0xF4 => 4,
+        _ => return 1, // continuation byte, or a lead that is never valid (C0, C1, F5..FF)
+    };
+    // Validate the whole sequence so overlong forms, surrogates and > U+10FFFF count as 1.
+    match b.get(..n) {
+        Some(seq) if std::str::from_utf8(seq).is_ok() => n,
+        _ => 1,
+    }
+}
+
+// GB18030 scanner (the "gbk" and "gb2312" codeset names are routed here as well).
+//   2-byte: [81-FE][40-7E,80-FE]
+//   4-byte: [81-FE][30-39][81-FE][30-39]
+// Returns 1 when the lead byte is out of range or the trail bytes are wrong or missing.
+// The lead byte is taken from `b0`; `b` is only inspected from index 1 onwards.
+fn gb18030_len(b: &[u8], b0: u8) -> usize {
+    if !matches!(b0, 0x81..=0xFE) {
+        return 1;
+    }
+    match b {
+        // Tried first; a digit-range second byte can never be a 2-byte trail (those start at 40).
+        [_, 0x30..=0x39, 0x81..=0xFE, 0x30..=0x39, ..] => 4,
+        [_, 0x40..=0x7E | 0x80..=0xFE, ..] => 2,
+        // Too short, or the second byte fits neither form.
+        _ => 1,
+    }
+}
+
+// EUC-JP scanner.
+//   3-byte: [8F][A1-FE][A1-FE]
+//   2-byte: [8E][A1-DF] or [A1-FE][A1-FE]
+// Anything else, including a sequence cut short by the end of `b`, counts as 1 byte.
+fn eucjp_len(b: &[u8], b0: u8) -> usize {
+    // The lead byte comes from `b0`; the slice patterns skip index 0 and test the trail bytes.
+    match (b0, b) {
+        // Three-byte form behind the 0x8F prefix.
+        (0x8F, [_, 0xA1..=0xFE, 0xA1..=0xFE, ..]) => 3,
+        // Two-byte forms: the 0x8E prefix with its narrower trail range, then the plain pair.
+        (0x8E, [_, 0xA1..=0xDF, ..]) => 2,
+        (0xA1..=0xFE, [_, 0xA1..=0xFE, ..]) => 2,
+        // Unknown lead, bad trail, or not enough bytes.
+        _ => 1,
+    }
+}
+
+// EUC-KR: the only multi-byte form is [A1-FE][A1-FE]; everything else counts as 1 byte.
+fn euckr_len(b: &[u8], b0: u8) -> usize {
+    match (b0, b.get(1)) {
+        (0xA1..=0xFE, Some(0xA1..=0xFE)) => 2,
+        // Lead outside the range, trail outside the range, or no trail byte at all.
+        _ => 1,
+    }
+}
+
+// Big5 (also chosen for Big5-HKSCS and EUC-TW locale names): 2-byte [81-FE][40-7E,A1-FE].
+// NOTE(review): EUC-TW's 4-byte form (0x8E lead) would be split in two here — confirm.
+fn big5_len(b: &[u8], b0: u8) -> usize {
+    let lead_ok = matches!(b0, 0x81..=0xFE);
+    let trail_ok = matches!(b.get(1), Some(0x40..=0x7E | 0xA1..=0xFE));
+    // A missing trail byte leaves `trail_ok` false, so truncated input counts as 1 byte.
+    if lead_ok && trail_ok {
+        return 2;
+    }
+    1
+}
diff --git a/src/uucore/src/lib/features/i18n/mod.rs b/src/uucore/src/lib/features/i18n/mod.rs
@@ -7,6 +7,8 @@ use std::sync::OnceLock;
 
 use icu_locale::{Locale, locale};
 
+#[cfg(feature = "i18n-charmap")]
+pub mod charmap;
 #[cfg(feature = "i18n-collator")]
 pub mod collator;
 #[cfg(feature = "i18n-datetime")]
diff --git a/tests/by-util/test_paste.rs b/tests/by-util/test_paste.rs
@@ -135,6 +135,30 @@ const EXAMPLE_DATA: &[TestData] = &[
         ins: &["1 \na \n", "2\t\nb\t\n"],
         out: "1 |2\t\na |b\t\n",
     },
+    TestData {
+        name: "utf8-2byte-delim",
+        args: &["-d", "\u{00A2}"],
+        ins: &["1\n2\n", "a\nb\n"],
+        out: "1\u{00A2}a\n2\u{00A2}b\n",
+    },
+    TestData {
+        name: "utf8-3byte-delim",
+        args: &["-d", "\u{20AC}"],
+        ins: &["1\n2\n", "a\nb\n"],
+        out: "1\u{20AC}a\n2\u{20AC}b\n",
+    },
+    TestData {
+        name: "utf8-4byte-delim",
+        args: &["-d", "\u{1F600}", "-s"],
+        ins: &["1\n2\n3\n"],
+        out: "1\u{1F600}2\u{1F600}3\n",
+    },
+    TestData {
+        name: "utf8-multi-delim-cycle",
+        args: &["-d", "\u{00A2}\u{20AC}"],
+        ins: &["a\nb\nc\n", "1\n2\n3\n", "x\ny\nz\n"],
+        out: "a\u{00A2}1\u{20AC}x\nb\u{00A2}2\u{20AC}y\nc\u{00A2}3\u{20AC}z\n",
+    },
 ];
 
 #[test]
@@ -334,6 +358,19 @@ fn test_backslash_zero_delimiter() {
     }
 }
 
+#[test]
+fn test_gnu_escape_sequences() {
+    // The GNU-style escapes \b, \f, \r and \v are accepted in the delimiter list;
+    // each must reach the output as the single control byte it names.
+    for (escape, control) in [(r"\b", 0x08u8), (r"\f", 0x0C), (r"\r", 0x0D), (r"\v", 0x0B)] {
+        new_ucmd!()
+            .args(&["-s", "-d", escape])
+            .pipe_in("1\n2\n3\n")
+            .succeeds()
+            .stdout_only_bytes([b'1', control, b'2', control, b'3', b'\n']);
+    }
+}
+
 // As of 2024-10-09, only bsdutils (https://github.com/dcantrell/bsdutils, derived from FreeBSD) and toybox handle
 // multibyte delimiter characters in the way a user would likely expect. BusyBox and GNU Core Utilities do not.
 #[test]
@@ -378,6 +415,21 @@ fn test_data() {
     }
 }
 
+#[test]
+#[cfg(target_os = "linux")]
+fn test_non_utf8_delimiter() {
+    let delimiter = std::ffi::OsString::from_vec(vec![0xA2, 0xE3]); // not valid UTF-8
+    let (fixtures, mut cmd) = at_and_ucmd!();
+    fixtures.write("f1", "1\n2\n");
+    fixtures.write("f2", "a\nb\n");
+    cmd.env("LC_ALL", "zh_CN.gb18030")
+        .arg("-d")
+        .arg(&delimiter)
+        .args(&["f1", "f2"])
+        .succeeds()
+        .stdout_only_bytes(b"1\xA2\xE3a\n2\xA2\xE3b\n");
+}
+
 #[test]
 #[cfg(target_os = "linux")]
 fn test_paste_non_utf8_paths() {
diff --git a/util/build-gnu.sh b/util/build-gnu.sh
@@ -162,6 +162,9 @@ fi
 grep -rl 'path_prepend_' tests/* | xargs -r "${SED}" -i 's| path_prepend_ ./src||'
 # path_prepend_ sets $abs_path_dir_: set it manually instead.
 grep -rl '\$abs_path_dir_' tests/*/*.sh | xargs -r "${SED}" -i "s|\$abs_path_dir_|${UU_BUILD_DIR//\//\\/}|g"
+# Some tests reference $abs_top_builddir/src (shell) or $ENV{abs_top_builddir}/src (Perl), e.g. in shebangs: point both at the uutils build dir.
+grep -rl '\$abs_top_builddir/src' tests/*/*.sh tests/*/*.pl | xargs -r "${SED}" -i "s|\$abs_top_builddir/src|${UU_BUILD_DIR//\//\\/}|g"
+grep -rl '\$ENV{abs_top_builddir}/src' tests/*/*.pl | xargs -r "${SED}" -i "s|\$ENV{abs_top_builddir}/src|${UU_BUILD_DIR//\//\\/}|g"
 
 # We can't build runcon and chcon without libselinux. But GNU no longer builds dummies of them. So consider they are SELinux specific.
 sed -i 's/^print_ver_.*/require_selinux_/' tests/runcon/runcon-compute.sh