Skip to content

Commit 04b6f0a

Browse files
authored
unexpand: use buffered reads + tests - improve performance by 58.43% (#10831)
* unexpand: use buffered reads and add new tests for chunked-read edge-case behaviour * unexpand: tests use no fixtures + fix edge case where blanks are divided by chunk bounds * unexpand: shrink chunked-read edge-case tests
1 parent f59f86a commit 04b6f0a

2 files changed

Lines changed: 214 additions & 88 deletions

File tree

src/uu/unexpand/src/unexpand.rs

Lines changed: 126 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
use clap::{Arg, ArgAction, Command};
99
use std::ffi::OsString;
1010
use std::fs::File;
11-
use std::io::{BufRead, BufReader, BufWriter, Read, Stdout, Write, stdin, stdout};
11+
use std::io::{BufReader, BufWriter, Read, Stdout, Write, stdin, stdout};
1212
use std::num::IntErrorKind;
1313
use std::path::Path;
1414
use std::str::from_utf8;
@@ -347,7 +347,7 @@ fn next_tabstop(tab_config: &TabConfig, col: usize) -> Option<usize> {
347347
fn write_tabs(
348348
output: &mut BufWriter<Stdout>,
349349
tab_config: &TabConfig,
350-
mut scol: usize,
350+
scol: &mut usize,
351351
col: usize,
352352
prevtab: bool,
353353
init: bool,
@@ -357,20 +357,20 @@ fn write_tabs(
357357
// We never turn a single space before a non-blank into
358358
// a tab, unless it's at the start of the line.
359359
let ai = init || amode;
360-
if (ai && !prevtab && col > scol + 1) || (col > scol && (init || ai && prevtab)) {
361-
while let Some(nts) = next_tabstop(tab_config, scol) {
362-
if col < scol + nts {
360+
if (ai && !prevtab && col > *scol + 1) || (col > *scol && (init || ai && prevtab)) {
361+
while let Some(nts) = next_tabstop(tab_config, *scol) {
362+
if col < *scol + nts {
363363
break;
364364
}
365365

366366
output.write_all(b"\t")?;
367-
scol += nts;
367+
*scol += nts;
368368
}
369369
}
370370

371-
while col > scol {
371+
while col > *scol {
372372
output.write_all(b" ")?;
373-
scol += 1;
373+
*scol += 1;
374374
}
375375
Ok(())
376376
}
@@ -424,101 +424,118 @@ fn next_char_info(uflag: bool, buf: &[u8], byte: usize) -> (CharType, usize, usi
424424
}
425425

426426
#[allow(clippy::cognitive_complexity)]
427-
fn unexpand_line(
428-
buf: &mut Vec<u8>,
427+
#[allow(clippy::too_many_arguments)]
428+
fn unexpand_buf(
429+
buf: &[u8],
429430
output: &mut BufWriter<Stdout>,
430431
options: &Options,
431432
lastcol: usize,
432433
tab_config: &TabConfig,
434+
col: &mut usize,
435+
scol: &mut usize,
436+
leading: &mut bool,
437+
pctype: &mut CharType,
433438
) -> UResult<()> {
434-
// Fast path: if we're not converting all spaces (-a flag not set)
435-
// and the line doesn't start with spaces, just write it directly
436-
if !options.aflag && !buf.is_empty() && buf[0] != b' ' && buf[0] != b'\t' {
437-
output.write_all(buf)?;
438-
buf.truncate(0);
439-
return Ok(());
439+
// We can only fast forward if we don't need to calculate col/scol
440+
if let Some(b'\n') = buf.last() {
441+
// Fast path: if we're not converting all spaces (-a flag not set)
442+
// and the line doesn't start with spaces, just write it directly
443+
if !options.aflag && !buf.is_empty() && ((buf[0] != b' ' && buf[0] != b'\t') || !*leading) {
444+
write_tabs(
445+
output,
446+
tab_config,
447+
scol,
448+
*col,
449+
*pctype == CharType::Tab,
450+
*leading,
451+
options.aflag,
452+
)?;
453+
*scol = *col;
454+
*col += buf.len();
455+
output.write_all(buf)?;
456+
return Ok(());
457+
}
440458
}
441459

442460
let mut byte = 0; // offset into the buffer
443-
let mut col = 0; // the current column
444-
let mut scol = 0; // the start col for the current span, i.e., the already-printed width
445-
let mut init = true; // are we at the start of the line?
446-
let mut pctype = CharType::Other;
447461

448-
// Fast path for leading spaces in non-UTF8 mode: count consecutive spaces/tabs at start
449-
if !options.uflag && !options.aflag {
450-
// In default mode (not -a), we only convert leading spaces
451-
// So we can batch process them and then copy the rest
452-
while byte < buf.len() {
453-
match buf[byte] {
454-
b' ' => {
455-
col += 1;
456-
byte += 1;
457-
}
458-
b'\t' => {
459-
col += next_tabstop(tab_config, col).unwrap_or(1);
460-
byte += 1;
461-
pctype = CharType::Tab;
462+
// We can only fast forward if we don't need to calculate col/scol
463+
if let Some(b'\n') = buf.last() {
464+
// Fast path for leading spaces in non-UTF8 mode: count consecutive spaces/tabs at start
465+
if !options.uflag && !options.aflag && *leading {
466+
// In default mode (not -a), we only convert leading spaces
467+
// So we can batch process them and then copy the rest
468+
while byte < buf.len() {
469+
match buf[byte] {
470+
b' ' => {
471+
*col += 1;
472+
byte += 1;
473+
}
474+
b'\t' => {
475+
*col += next_tabstop(tab_config, *col).unwrap_or(1);
476+
byte += 1;
477+
*pctype = CharType::Tab;
478+
}
479+
_ => break,
462480
}
463-
_ => break,
464481
}
465-
}
466482

467-
// If we found spaces/tabs, write them as tabs
468-
if byte > 0 {
469-
write_tabs(
470-
output,
471-
tab_config,
472-
0,
473-
col,
474-
pctype == CharType::Tab,
475-
true,
476-
true,
477-
)?;
478-
}
483+
// If we found spaces/tabs, write them as tabs
484+
if byte > 0 {
485+
write_tabs(
486+
output,
487+
tab_config,
488+
scol,
489+
*col,
490+
*pctype == CharType::Tab,
491+
true,
492+
options.aflag,
493+
)?;
494+
}
479495

480-
// Write the rest of the line directly (no more tab conversion needed)
481-
if byte < buf.len() {
482-
output.write_all(&buf[byte..])?;
496+
// Write the rest of the line directly (no more tab conversion needed)
497+
if byte < buf.len() {
498+
*leading = false;
499+
output.write_all(&buf[byte..])?;
500+
}
501+
return Ok(());
483502
}
484-
buf.truncate(0);
485-
return Ok(());
486503
}
487504

488505
while byte < buf.len() {
489506
// when we have a finite number of columns, never convert past the last column
490-
if lastcol > 0 && col >= lastcol {
507+
if lastcol > 0 && *col >= lastcol {
491508
write_tabs(
492509
output,
493510
tab_config,
494511
scol,
495-
col,
496-
pctype == CharType::Tab,
497-
init,
512+
*col,
513+
*pctype == CharType::Tab,
514+
*leading,
498515
true,
499516
)?;
500517
output.write_all(&buf[byte..])?;
501-
scol = col;
518+
*scol = *col;
502519
break;
503520
}
504521

505522
// figure out how big the next char is, if it's UTF-8
506523
let (ctype, cwidth, nbytes) = next_char_info(options.uflag, buf, byte);
507524

508525
// now figure out how many columns this char takes up, and maybe print it
509-
let tabs_buffered = init || options.aflag;
526+
let tabs_buffered = *leading || options.aflag;
510527
match ctype {
511528
CharType::Space | CharType::Tab => {
512529
// compute next col, but only write space or tab chars if not buffering
513-
col += if ctype == CharType::Space {
530+
*col += if ctype == CharType::Space {
514531
1
515532
} else {
516-
next_tabstop(tab_config, col).unwrap_or(1)
533+
next_tabstop(tab_config, *col).unwrap_or(1)
517534
};
518535

519536
if !tabs_buffered {
520537
output.write_all(&buf[byte..byte + nbytes])?;
521-
scol = col; // now printed up to this column
538+
*scol = *col; // now printed up to this column
522539
}
523540
}
524541
CharType::Other | CharType::Backspace => {
@@ -527,42 +544,30 @@ fn unexpand_line(
527544
output,
528545
tab_config,
529546
scol,
530-
col,
531-
pctype == CharType::Tab,
532-
init,
547+
*col,
548+
*pctype == CharType::Tab,
549+
*leading,
533550
options.aflag,
534551
)?;
535-
init = false; // no longer at the start of a line
536-
col = if ctype == CharType::Other {
552+
*leading = false; // no longer at the start of a line
553+
*col = if ctype == CharType::Other {
537554
// use computed width
538-
col + cwidth
539-
} else if col > 0 {
555+
*col + cwidth
556+
} else if *col > 0 {
540557
// Backspace case, but only if col > 0
541-
col - 1
558+
*col - 1
542559
} else {
543560
0
544561
};
545562
output.write_all(&buf[byte..byte + nbytes])?;
546-
scol = col; // we've now printed up to this column
563+
*scol = *col; // we've now printed up to this column
547564
}
548565
}
549566

550567
byte += nbytes; // move on to next char
551-
pctype = ctype; // save the previous type
568+
*pctype = ctype; // save the previous type
552569
}
553570

554-
// write out anything remaining
555-
write_tabs(
556-
output,
557-
tab_config,
558-
scol,
559-
col,
560-
pctype == CharType::Tab,
561-
init,
562-
true,
563-
)?;
564-
buf.truncate(0); // clear out the buffer
565-
566571
Ok(())
567572
}
568573

@@ -573,15 +578,49 @@ fn unexpand_file(
573578
lastcol: usize,
574579
tab_config: &TabConfig,
575580
) -> UResult<()> {
576-
let mut buf = Vec::new();
581+
let mut buf = [0u8; 4096];
577582
let mut input = open(file)?;
583+
let mut col = 0;
584+
let mut scol = 0;
585+
let mut leading = true;
586+
let mut pctype = CharType::Other;
578587
loop {
579-
match input.read_until(b'\n', &mut buf) {
588+
match input.read(&mut buf) {
580589
Ok(0) => break,
581-
Ok(_) => unexpand_line(&mut buf, output, options, lastcol, tab_config)?,
590+
Ok(n) => {
591+
for line in buf[..n].split_inclusive(|b| *b == b'\n') {
592+
unexpand_buf(
593+
line,
594+
output,
595+
options,
596+
lastcol,
597+
tab_config,
598+
&mut col,
599+
&mut scol,
600+
&mut leading,
601+
&mut pctype,
602+
)?;
603+
if let Some(b'\n') = line.last() {
604+
col = 0;
605+
scol = 0;
606+
leading = true;
607+
pctype = CharType::Other;
608+
}
609+
}
610+
}
582611
Err(e) => return Err(e.map_err_context(|| file.maybe_quote().to_string())),
583612
}
584613
}
614+
// write out anything remaining
615+
write_tabs(
616+
output,
617+
tab_config,
618+
&mut scol,
619+
col,
620+
pctype == CharType::Tab,
621+
leading,
622+
options.aflag,
623+
)?;
585624
Ok(())
586625
}
587626

0 commit comments

Comments
 (0)