Skip to content

Commit 4bbd71f

Browse files
authored
unexpand: use buffered read & improve performance by 34.66% (#10798)
1 parent 4853729 commit 4bbd71f

22 files changed

Lines changed: 283 additions & 73 deletions

src/uu/unexpand/src/unexpand.rs

Lines changed: 100 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
use clap::{Arg, ArgAction, Command};
99
use std::ffi::OsString;
1010
use std::fs::File;
11-
use std::io::{BufRead, BufReader, BufWriter, Read, Stdout, Write, stdin, stdout};
11+
use std::io::{BufReader, BufWriter, Read, Stdout, Write, stdin, stdout};
1212
use std::num::IntErrorKind;
1313
use std::path::Path;
1414
use std::str::from_utf8;
@@ -347,7 +347,7 @@ fn next_tabstop(tab_config: &TabConfig, col: usize) -> Option<usize> {
347347
fn write_tabs(
348348
output: &mut BufWriter<Stdout>,
349349
tab_config: &TabConfig,
350-
mut scol: usize,
350+
scol: &mut usize,
351351
col: usize,
352352
prevtab: bool,
353353
init: bool,
@@ -357,20 +357,20 @@ fn write_tabs(
357357
// We never turn a single space before a non-blank into
358358
// a tab, unless it's at the start of the line.
359359
let ai = init || amode;
360-
if (ai && !prevtab && col > scol + 1) || (col > scol && (init || ai && prevtab)) {
361-
while let Some(nts) = next_tabstop(tab_config, scol) {
362-
if col < scol + nts {
360+
if (ai && !prevtab && col > *scol + 1) || (col > *scol && (init || ai && prevtab)) {
361+
while let Some(nts) = next_tabstop(tab_config, *scol) {
362+
if col < *scol + nts {
363363
break;
364364
}
365365

366366
output.write_all(b"\t")?;
367-
scol += nts;
367+
*scol += nts;
368368
}
369369
}
370370

371-
while col > scol {
371+
while col > *scol {
372372
output.write_all(b" ")?;
373-
scol += 1;
373+
*scol += 1;
374374
}
375375
Ok(())
376376
}
@@ -424,101 +424,108 @@ fn next_char_info(uflag: bool, buf: &[u8], byte: usize) -> (CharType, usize, usi
424424
}
425425

426426
#[allow(clippy::cognitive_complexity)]
427+
#[allow(clippy::too_many_arguments)]
427428
fn unexpand_line(
428-
buf: &mut Vec<u8>,
429+
buf: &[u8],
429430
output: &mut BufWriter<Stdout>,
430431
options: &Options,
431432
lastcol: usize,
432433
tab_config: &TabConfig,
434+
col: &mut usize,
435+
scol: &mut usize,
436+
leading: &mut bool,
433437
) -> UResult<()> {
434-
// Fast path: if we're not converting all spaces (-a flag not set)
435-
// and the line doesn't start with spaces, just write it directly
436-
if !options.aflag && !buf.is_empty() && buf[0] != b' ' && buf[0] != b'\t' {
437-
output.write_all(buf)?;
438-
buf.truncate(0);
439-
return Ok(());
438+
// The fast path below does not keep col/scol fully up to date, so it is only safe when this slice ends with a newline (i.e. it completes the line)
439+
if let Some(b'\n') = buf.last() {
440+
// Fast path: if we're not converting all spaces (-a flag not set)
441+
// and the line doesn't start with spaces, just write it directly
442+
if !options.aflag && !buf.is_empty() && ((buf[0] != b' ' && buf[0] != b'\t') || !*leading) {
443+
*col += buf.len();
444+
output.write_all(buf)?;
445+
return Ok(());
446+
}
440447
}
441448

442449
let mut byte = 0; // offset into the buffer
443-
let mut col = 0; // the current column
444-
let mut scol = 0; // the start col for the current span, i.e., the already-printed width
445-
let mut init = true; // are we at the start of the line?
446450
let mut pctype = CharType::Other;
447451

448-
// Fast path for leading spaces in non-UTF8 mode: count consecutive spaces/tabs at start
449-
if !options.uflag && !options.aflag {
450-
// In default mode (not -a), we only convert leading spaces
451-
// So we can batch process them and then copy the rest
452-
while byte < buf.len() {
453-
match buf[byte] {
454-
b' ' => {
455-
col += 1;
456-
byte += 1;
457-
}
458-
b'\t' => {
459-
col += next_tabstop(tab_config, col).unwrap_or(1);
460-
byte += 1;
461-
pctype = CharType::Tab;
452+
// This fast path does not keep col/scol up to date for the copied remainder, so only take it when this slice ends with a newline
453+
if let Some(b'\n') = buf.last() {
454+
// Fast path for leading spaces in non-UTF8 mode: count consecutive spaces/tabs at start
455+
if !options.uflag && !options.aflag && *leading {
456+
// In default mode (not -a), we only convert leading spaces
457+
// So we can batch process them and then copy the rest
458+
while byte < buf.len() {
459+
match buf[byte] {
460+
b' ' => {
461+
*col += 1;
462+
byte += 1;
463+
}
464+
b'\t' => {
465+
*col += next_tabstop(tab_config, *col).unwrap_or(1);
466+
byte += 1;
467+
pctype = CharType::Tab;
468+
}
469+
_ => break,
462470
}
463-
_ => break,
464471
}
465-
}
466472

467-
// If we found spaces/tabs, write them as tabs
468-
if byte > 0 {
469-
write_tabs(
470-
output,
471-
tab_config,
472-
0,
473-
col,
474-
pctype == CharType::Tab,
475-
true,
476-
true,
477-
)?;
478-
}
473+
// If we found spaces/tabs, write them as tabs
474+
if byte > 0 {
475+
write_tabs(
476+
output,
477+
tab_config,
478+
scol,
479+
*col,
480+
pctype == CharType::Tab,
481+
true,
482+
true,
483+
)?;
484+
}
479485

480-
// Write the rest of the line directly (no more tab conversion needed)
481-
if byte < buf.len() {
482-
output.write_all(&buf[byte..])?;
486+
// Write the rest of the line directly (no more tab conversion needed)
487+
if byte < buf.len() {
488+
*leading = false;
489+
output.write_all(&buf[byte..])?;
490+
}
491+
return Ok(());
483492
}
484-
buf.truncate(0);
485-
return Ok(());
486493
}
487494

488495
while byte < buf.len() {
489496
// when we have a finite number of columns, never convert past the last column
490-
if lastcol > 0 && col >= lastcol {
497+
if lastcol > 0 && *col >= lastcol {
491498
write_tabs(
492499
output,
493500
tab_config,
494501
scol,
495-
col,
502+
*col,
496503
pctype == CharType::Tab,
497-
init,
504+
*leading,
498505
true,
499506
)?;
500507
output.write_all(&buf[byte..])?;
501-
scol = col;
508+
*scol = *col;
502509
break;
503510
}
504511

505512
// figure out how big the next char is, if it's UTF-8
506513
let (ctype, cwidth, nbytes) = next_char_info(options.uflag, buf, byte);
507514

508515
// now figure out how many columns this char takes up, and maybe print it
509-
let tabs_buffered = init || options.aflag;
516+
let tabs_buffered = *leading || options.aflag;
510517
match ctype {
511518
CharType::Space | CharType::Tab => {
512519
// compute next col, but only write space or tab chars if not buffering
513-
col += if ctype == CharType::Space {
520+
*col += if ctype == CharType::Space {
514521
1
515522
} else {
516-
next_tabstop(tab_config, col).unwrap_or(1)
523+
next_tabstop(tab_config, *col).unwrap_or(1)
517524
};
518525

519526
if !tabs_buffered {
520527
output.write_all(&buf[byte..byte + nbytes])?;
521-
scol = col; // now printed up to this column
528+
*scol = *col; // now printed up to this column
522529
}
523530
}
524531
CharType::Other | CharType::Backspace => {
@@ -527,23 +534,23 @@ fn unexpand_line(
527534
output,
528535
tab_config,
529536
scol,
530-
col,
537+
*col,
531538
pctype == CharType::Tab,
532-
init,
539+
*leading,
533540
options.aflag,
534541
)?;
535-
init = false; // no longer at the start of a line
536-
col = if ctype == CharType::Other {
542+
*leading = false; // no longer at the start of a line
543+
*col = if ctype == CharType::Other {
537544
// use computed width
538-
col + cwidth
539-
} else if col > 0 {
545+
*col + cwidth
546+
} else if *col > 0 {
540547
// Backspace case, but only if col > 0
541-
col - 1
548+
*col - 1
542549
} else {
543550
0
544551
};
545552
output.write_all(&buf[byte..byte + nbytes])?;
546-
scol = col; // we've now printed up to this column
553+
*scol = *col; // we've now printed up to this column
547554
}
548555
}
549556

@@ -556,12 +563,11 @@ fn unexpand_line(
556563
output,
557564
tab_config,
558565
scol,
559-
col,
566+
*col,
560567
pctype == CharType::Tab,
561-
init,
568+
*leading,
562569
true,
563570
)?;
564-
buf.truncate(0); // clear out the buffer
565571

566572
Ok(())
567573
}
@@ -573,12 +579,33 @@ fn unexpand_file(
573579
lastcol: usize,
574580
tab_config: &TabConfig,
575581
) -> UResult<()> {
576-
let mut buf = Vec::new();
582+
let mut buf = [0u8; 4096];
577583
let mut input = open(file)?;
584+
let mut col = 0;
585+
let mut scol = 0;
586+
let mut leading = true;
578587
loop {
579-
match input.read_until(b'\n', &mut buf) {
588+
match input.read(&mut buf) {
580589
Ok(0) => break,
581-
Ok(_) => unexpand_line(&mut buf, output, options, lastcol, tab_config)?,
590+
Ok(n) => {
591+
for line in buf[..n].split_inclusive(|b| *b == b'\n') {
592+
unexpand_line(
593+
line,
594+
output,
595+
options,
596+
lastcol,
597+
tab_config,
598+
&mut col,
599+
&mut scol,
600+
&mut leading,
601+
)?;
602+
if let Some(b'\n') = line.last() {
603+
col = 0;
604+
scol = 0;
605+
leading = true;
606+
}
607+
}
608+
}
582609
Err(e) => return Err(e.map_err_context(|| file.maybe_quote().to_string())),
583610
}
584611
}

0 commit comments

Comments
 (0)