@@ -53,10 +53,15 @@ const OPT_REFERENCE: &str = "reference";
5353const OPT_UNIVERSAL : & str = "universal" ;
5454const OPT_UNIVERSAL_2 : & str = "utc" ;
5555
56+ /// U+FFFD, the character `String::from_utf8_lossy` emits for each ill-formed byte subsequence; formatted output is scanned for it to restore the original bytes.
57+ const UNICODE_REPLACEMENT : char = '\u{FFFD}' ;
58+
5659/// Settings for this program, parsed from the command line
5760struct Settings {
5861 utc : bool ,
5962 format : Format ,
63+ /// Raw format bytes for Custom format, to preserve non-UTF-8 bytes in output
64+ format_raw : Option < Vec < u8 > > ,
6065 date_source : DateSource ,
6166 set_to : Option < Zoned > ,
6267 debug : bool ,
@@ -318,25 +323,31 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
318323 }
319324 }
320325
326+ let mut format_raw: Option < Vec < u8 > > = None ;
321327 let format = if let Some ( form) = matches. get_one :: < OsString > ( OPT_FORMAT ) {
322- let form = form. to_string_lossy ( ) ;
323- if !form. starts_with ( '+' ) {
328+ let raw_bytes = form. as_encoded_bytes ( ) ;
329+ if raw_bytes. first ( ) != Some ( & b'+' ) {
330+ let form_lossy = form. to_string_lossy ( ) ;
324331 // if an optional Format String was found but the user has not provided an input date
325332 // GNU prints an invalid date Error
326333 if !matches ! ( date_source, DateSource :: Human ( _) ) {
327334 return Err ( USimpleError :: new (
328335 1 ,
329- translate ! ( "date-error-invalid-date" , "date" => form ) ,
336+ translate ! ( "date-error-invalid-date" , "date" => form_lossy ) ,
330337 ) ) ;
331338 }
332339 // If the user did provide an input date with the --date flag and the Format String is
333340 // not starting with '+' GNU prints the missing '+' error message
334341 return Err ( USimpleError :: new (
335342 1 ,
336- translate ! ( "date-error-format-missing-plus" , "arg" => form ) ,
343+ translate ! ( "date-error-format-missing-plus" , "arg" => form_lossy ) ,
337344 ) ) ;
338345 }
339- let form = form[ 1 ..] . to_string ( ) ;
346+ let bytes_after_plus = & raw_bytes[ 1 ..] ;
347+ if std:: str:: from_utf8 ( bytes_after_plus) . is_err ( ) {
348+ format_raw = Some ( bytes_after_plus. to_vec ( ) ) ;
349+ }
350+ let form = String :: from_utf8_lossy ( bytes_after_plus) . into_owned ( ) ;
340351 Format :: Custom ( form)
341352 } else if let Some ( fmt) = matches
342353 . get_many :: < String > ( OPT_ISO_8601 )
@@ -383,6 +394,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
383394 let settings = Settings {
384395 utc,
385396 format,
397+ format_raw,
386398 date_source,
387399 set_to,
388400 debug : debug_mode,
@@ -546,6 +558,26 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
546558 let format_string = make_format_string ( & settings) ;
547559 let mut stdout = BufWriter :: new ( std:: io:: stdout ( ) . lock ( ) ) ;
548560
561+ // Pre-extract non-UTF-8 chunks from the raw format bytes (if any).
562+ // from_utf8_lossy emits one U+FFFD per ill-formed subsequence (WTF-8 spec),
563+ // so we can match them 1:1 when restoring original bytes in the output.
564+ let raw_chunks: Option < Vec < & [ u8 ] > > = settings. format_raw . as_ref ( ) . map ( |raw| {
565+ let mut chunks = Vec :: new ( ) ;
566+ let mut i = 0 ;
567+ while i < raw. len ( ) {
568+ match std:: str:: from_utf8 ( & raw [ i..] ) {
569+ Ok ( _) => break ,
570+ Err ( e) => {
571+ i += e. valid_up_to ( ) ;
572+ let len = e. error_len ( ) . unwrap_or ( raw. len ( ) - i) ;
573+ chunks. push ( & raw [ i..i + len] ) ;
574+ i += len;
575+ }
576+ }
577+ }
578+ chunks
579+ } ) ;
580+
549581 // Format all the dates
550582 let config = Config :: new ( ) . custom ( PosixCustom :: new ( ) ) . lenient ( true ) ;
551583 for date in dates {
@@ -564,9 +596,34 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
564596 & config,
565597 skip_localization,
566598 ) {
567- Ok ( s) => writeln ! ( stdout, "{s}" ) . map_err ( |e| {
568- USimpleError :: new ( 1 , translate ! ( "date-error-write" , "error" => e) )
569- } ) ?,
599+ Ok ( s) => {
600+ if let Some ( ref chunks) = raw_chunks {
601+ // Restore non-UTF-8 bytes that were replaced with
602+ // U+FFFD by the lossy conversion. strftime passes
603+ // U+FFFD through unchanged. Each FFFD in the output
604+ // corresponds to the next ill-formed byte subsequence
605+ // from the original format string.
606+ let mut chunk_iter = chunks. iter ( ) ;
607+ let mut out = Vec :: with_capacity ( s. len ( ) ) ;
608+ for ch in s. chars ( ) {
609+ if ch == UNICODE_REPLACEMENT {
610+ if let Some ( chunk) = chunk_iter. next ( ) {
611+ out. extend_from_slice ( chunk) ;
612+ }
613+ } else {
614+ let mut buf = [ 0u8 ; 4 ] ;
615+ out. extend_from_slice ( ch. encode_utf8 ( & mut buf) . as_bytes ( ) ) ;
616+ }
617+ }
618+ out. push ( b'\n' ) ;
619+ stdout. write_all ( & out)
620+ } else {
621+ writeln ! ( stdout, "{s}" )
622+ }
623+ . map_err ( |e| {
624+ USimpleError :: new ( 1 , translate ! ( "date-error-write" , "error" => e) )
625+ } ) ?;
626+ }
570627 Err ( e) => {
571628 let _ = stdout. flush ( ) ;
572629 return Err ( USimpleError :: new (
0 commit comments