Skip to content

Commit 88c9ef0

Browse files
committed
unexpand: use utf8_chunks
1 parent 8aa9d9c commit 88c9ef0

1 file changed

Lines changed: 57 additions & 64 deletions

File tree

src/uu/unexpand/src/unexpand.rs

Lines changed: 57 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ use std::fs::File;
1111
use std::io::{self, BufReader, BufWriter, Read, Stdin, Stdout, Write, stdin, stdout};
1212
use std::num::IntErrorKind;
1313
use std::path::Path;
14-
use std::str::from_utf8;
1514
use thiserror::Error;
1615
use uucore::display::Quotable;
1716
use uucore::error::{FromIo, UError, UResult, USimpleError, set_exit_code};
@@ -396,29 +395,14 @@ enum CharType {
396395
Other,
397396
}
398397

399-
fn next_char_info(utf8: bool, buf: &[u8], byte: usize) -> (CharType, usize, usize) {
398+
fn char_info(c: char) -> (CharType, usize) {
400399
use CharType::{Backspace, Other, Space, Tab};
401-
let b = buf[byte];
402-
if b.is_ascii() {
403-
return match b {
404-
b' ' => (Space, 0, 1),
405-
b'\t' => (Tab, 0, 1),
406-
b'\x08' => (Backspace, 0, 1),
407-
_ => (Other, 1, 1),
408-
};
409-
}
410-
411-
if utf8 {
412-
let nbytes = char::from(b).len_utf8();
413-
// don't overrun the buffer because of invalid UTF-8
414-
if buf
415-
.get(byte..byte + nbytes)
416-
.is_some_and(|s| from_utf8(s).is_ok())
417-
{
418-
return (Other, nbytes, nbytes);
419-
}
400+
match c {
401+
' ' => (Space, 1),
402+
'\t' => (Tab, 0),
403+
'\x08' => (Backspace, 0),
404+
_ => (Other, 1),
420405
}
421-
(Other, 1, 1)
422406
}
423407

424408
// This struct is used to store the current state of printing the input buf.
@@ -506,56 +490,65 @@ fn unexpand_buf(
506490
}
507491
}
508492

509-
while byte < buf.len() {
510-
// when we have a finite number of columns, never convert past the last column
511-
if lastcol > 0 && print_state.col >= lastcol {
512-
write_tabs(output, tab_config, print_state, true)?;
513-
output.write_all(&buf[byte..])?;
514-
print_state.scol = print_state.col;
515-
break;
516-
}
517-
518-
// figure out how big the next char is, if it's UTF-8
519-
let (ctype, cwidth, nbytes) = next_char_info(options.utf8, buf, byte);
493+
for chunk in buf.utf8_chunks() {
494+
for c in chunk.valid().chars() {
495+
// when we have a finite number of columns, never convert past the last column
496+
if lastcol > 0 && print_state.col >= lastcol {
497+
write_tabs(output, tab_config, print_state, true)?;
498+
output.write_all(&buf[byte..])?;
499+
print_state.scol = print_state.col;
500+
break;
501+
}
520502

521-
// now figure out how many columns this char takes up, and maybe print it
522-
let tabs_buffered = print_state.leading || options.aflag;
523-
match ctype {
524-
CharType::Space | CharType::Tab => {
525-
// compute next col, but only write space or tab chars if not buffering
526-
print_state.col += if ctype == CharType::Space {
527-
1
528-
} else {
529-
next_tabstop(tab_config, print_state.col).unwrap_or(1)
530-
};
503+
// figure out how big the next char is, if it's UTF-8
504+
let (ctype, cwidth) = char_info(c);
505+
let nbytes = c.len_utf8();
506+
507+
// now figure out how many columns this char takes up, and maybe print it
508+
let tabs_buffered = print_state.leading || options.aflag;
509+
match ctype {
510+
CharType::Space | CharType::Tab => {
511+
// compute next col, but only write space or tab chars if not buffering
512+
print_state.col += if ctype == CharType::Space {
513+
1
514+
} else {
515+
next_tabstop(tab_config, print_state.col).unwrap_or(1)
516+
};
531517

532-
if !tabs_buffered {
518+
if !tabs_buffered {
519+
output.write_all(&buf[byte..byte + nbytes])?;
520+
print_state.scol = print_state.col; // now printed up to this column
521+
}
522+
}
523+
CharType::Other | CharType::Backspace => {
524+
// always
525+
write_tabs(output, tab_config, print_state, options.aflag)?;
526+
print_state.leading = false; // no longer at the start of a line
527+
print_state.col = if ctype == CharType::Other {
528+
// use computed width
529+
print_state.col + cwidth
530+
} else if print_state.col > 0 {
531+
// Backspace case, but only if col > 0
532+
print_state.col - 1
533+
} else {
534+
0
535+
};
533536
output.write_all(&buf[byte..byte + nbytes])?;
534-
print_state.scol = print_state.col; // now printed up to this column
537+
print_state.scol = print_state.col; // we've now printed up to this column
535538
}
536539
}
537-
CharType::Other | CharType::Backspace => {
538-
// always
539-
write_tabs(output, tab_config, print_state, options.aflag)?;
540-
print_state.leading = false; // no longer at the start of a line
541-
print_state.col = if ctype == CharType::Other {
542-
// use computed width
543-
print_state.col + cwidth
544-
} else if print_state.col > 0 {
545-
// Backspace case, but only if col > 0
546-
print_state.col - 1
547-
} else {
548-
0
549-
};
550-
output.write_all(&buf[byte..byte + nbytes])?;
551-
print_state.scol = print_state.col; // we've now printed up to this column
552-
}
540+
print_state.pctype = ctype; // save the previous type
553541
}
554542

555-
byte += nbytes; // move on to next char
556-
print_state.pctype = ctype; // save the previous type
543+
for &b in chunk.invalid() {
544+
write_tabs(output, tab_config, print_state, options.aflag)?;
545+
print_state.leading = false;
546+
print_state.col += 1;
547+
output.write_all(&[b])?;
548+
print_state.scol = print_state.col;
549+
print_state.pctype = CharType::Other;
550+
}
557551
}
558-
559552
Ok(())
560553
}
561554

0 commit comments

Comments
 (0)