Skip to content

Commit 95bd071

Browse files
committed
date: preserve non-UTF-8 bytes in format string output
1 parent c2d7a4e commit 95bd071

2 files changed

Lines changed: 79 additions & 17 deletions

File tree

src/uu/date/src/date.rs

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,15 @@ const OPT_REFERENCE: &str = "reference";
5353
const OPT_UNIVERSAL: &str = "universal";
5454
const OPT_UNIVERSAL_2: &str = "utc";
5555

56+
/// Character emitted by `String::from_utf8_lossy` for each ill-formed byte subsequence.
57+
const UNICODE_REPLACEMENT: char = '\u{FFFD}';
58+
5659
/// Settings for this program, parsed from the command line
5760
struct Settings {
5861
utc: bool,
5962
format: Format,
63+
/// Raw format bytes after the leading '+', set only when they are not valid UTF-8, so the original bytes can be restored in output
64+
format_raw: Option<Vec<u8>>,
6065
date_source: DateSource,
6166
set_to: Option<Zoned>,
6267
debug: bool,
@@ -318,25 +323,31 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
318323
}
319324
}
320325

326+
let mut format_raw: Option<Vec<u8>> = None;
321327
let format = if let Some(form) = matches.get_one::<OsString>(OPT_FORMAT) {
322-
let form = form.to_string_lossy();
323-
if !form.starts_with('+') {
328+
let raw_bytes = form.as_encoded_bytes();
329+
if raw_bytes.first() != Some(&b'+') {
330+
let form_lossy = form.to_string_lossy();
324331
// if an optional Format String was found but the user has not provided an input date
325332
// GNU prints an invalid date Error
326333
if !matches!(date_source, DateSource::Human(_)) {
327334
return Err(USimpleError::new(
328335
1,
329-
translate!("date-error-invalid-date", "date" => form),
336+
translate!("date-error-invalid-date", "date" => form_lossy),
330337
));
331338
}
332339
// If the user did provide an input date with the --date flag and the Format String is
333340
// not starting with '+' GNU prints the missing '+' error message
334341
return Err(USimpleError::new(
335342
1,
336-
translate!("date-error-format-missing-plus", "arg" => form),
343+
translate!("date-error-format-missing-plus", "arg" => form_lossy),
337344
));
338345
}
339-
let form = form[1..].to_string();
346+
let bytes_after_plus = &raw_bytes[1..];
347+
if std::str::from_utf8(bytes_after_plus).is_err() {
348+
format_raw = Some(bytes_after_plus.to_vec());
349+
}
350+
let form = String::from_utf8_lossy(bytes_after_plus).into_owned();
340351
Format::Custom(form)
341352
} else if let Some(fmt) = matches
342353
.get_many::<String>(OPT_ISO_8601)
@@ -383,6 +394,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
383394
let settings = Settings {
384395
utc,
385396
format,
397+
format_raw,
386398
date_source,
387399
set_to,
388400
debug: debug_mode,
@@ -546,6 +558,26 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
546558
let format_string = make_format_string(&settings);
547559
let mut stdout = BufWriter::new(std::io::stdout().lock());
548560

561+
// Pre-extract non-UTF-8 chunks from the raw format bytes (if any).
562+
// from_utf8_lossy emits one U+FFFD per ill-formed subsequence (the span reported by Utf8Error::error_len),
563+
// so we can match them 1:1 when restoring original bytes in the output (this assumes the format contains no literal U+FFFD of its own).
564+
let raw_chunks: Option<Vec<&[u8]>> = settings.format_raw.as_ref().map(|raw| {
565+
let mut chunks = Vec::new();
566+
let mut i = 0;
567+
while i < raw.len() {
568+
match std::str::from_utf8(&raw[i..]) {
569+
Ok(_) => break,
570+
Err(e) => {
571+
i += e.valid_up_to();
572+
let len = e.error_len().unwrap_or(raw.len() - i);
573+
chunks.push(&raw[i..i + len]);
574+
i += len;
575+
}
576+
}
577+
}
578+
chunks
579+
});
580+
549581
// Format all the dates
550582
let config = Config::new().custom(PosixCustom::new()).lenient(true);
551583
for date in dates {
@@ -564,9 +596,34 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
564596
&config,
565597
skip_localization,
566598
) {
567-
Ok(s) => writeln!(stdout, "{s}").map_err(|e| {
568-
USimpleError::new(1, translate!("date-error-write", "error" => e))
569-
})?,
599+
Ok(s) => {
600+
if let Some(ref chunks) = raw_chunks {
601+
// Restore non-UTF-8 bytes that were replaced with
602+
// U+FFFD by the lossy conversion. strftime passes
603+
// U+FFFD through unchanged. Each FFFD in the output
604+
// corresponds to the next ill-formed byte subsequence
605+
// from the original format string.
606+
let mut chunk_iter = chunks.iter();
607+
let mut out = Vec::with_capacity(s.len());
608+
for ch in s.chars() {
609+
if ch == UNICODE_REPLACEMENT {
610+
if let Some(chunk) = chunk_iter.next() {
611+
out.extend_from_slice(chunk);
612+
}
613+
} else {
614+
let mut buf = [0u8; 4];
615+
out.extend_from_slice(ch.encode_utf8(&mut buf).as_bytes());
616+
}
617+
}
618+
out.push(b'\n');
619+
stdout.write_all(&out)
620+
} else {
621+
writeln!(stdout, "{s}")
622+
}
623+
.map_err(|e| {
624+
USimpleError::new(1, translate!("date-error-write", "error" => e))
625+
})?;
626+
}
570627
Err(e) => {
571628
let _ = stdout.flush();
572629
return Err(USimpleError::new(

tests/by-util/test_date.rs

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2349,26 +2349,31 @@ fn test_locale_day_names() {
23492349
}
23502350
}
23512351

2352-
/// Test that non-UTF-8 format bytes don't cause errors.
2353-
/// zh_CN.GB18030's date_fmt contains Chinese characters (年, 月, 日) encoded
2354-
/// in GB18030 which is not valid UTF-8. The format argument must handle this
2355-
/// gracefully via lossy conversion.
2352+
/// Test that non-UTF-8 format bytes are preserved in output (not replaced
2353+
/// with U+FFFD), matching GNU behavior.
23562354
#[test]
23572355
#[cfg(unix)]
2358-
fn test_date_non_utf8_locale_gb18030() {
2356+
fn test_date_non_utf8_format_preserved() {
23592357
use std::ffi::OsStr;
23602358
use std::os::unix::ffi::OsStrExt;
23612359

2360+
// Simple case: \xFF should pass through as-is
2361+
let fmt_bytes: &[u8] = b"+\xff%m";
2362+
new_ucmd!()
2363+
.arg("-d")
2364+
.arg("2025-10-11T13:00")
2365+
.arg(OsStr::from_bytes(fmt_bytes))
2366+
.succeeds()
2367+
.stdout_is_bytes(b"\xff10\n");
2368+
23622369
// GB18030-encoded "年" (0xC4EA) + "%m" + "月" (0xD4C2) + "%d" + "日" (0xC8D5)
2363-
// These bytes are valid GB18030 but not valid UTF-8.
23642370
let fmt_bytes: &[u8] = b"+\xc4\xea%m\xd4\xc2%d\xc8\xd5";
2365-
23662371
new_ucmd!()
2367-
.env("LC_ALL", "zh_CN.GB18030")
23682372
.arg("-d")
23692373
.arg("2025-10-11T13:00")
23702374
.arg(OsStr::from_bytes(fmt_bytes))
2371-
.succeeds();
2375+
.succeeds()
2376+
.stdout_is_bytes(b"\xc4\xea10\xd4\xc211\xc8\xd5\n");
23722377
}
23732378

23742379
#[test]

0 commit comments

Comments
 (0)