33// For the full copyright and license information, please view the LICENSE
44// file that was distributed with this source code.
55
6- // spell-checker:ignore (ToDO) ctype cwidth iflag nbytes nspaces nums tspaces uflag Preprocess
6+ // spell-checker:ignore (ToDO) ctype cwidth iflag nbytes nspaces nums tspaces Preprocess
77
88use clap:: { Arg , ArgAction , ArgMatches , Command } ;
99use std:: ffi:: OsString ;
@@ -174,7 +174,7 @@ struct Options {
174174 tabstops : Vec < usize > ,
175175 tspaces : String ,
176176 iflag : bool ,
177- uflag : bool ,
177+ utf8 : bool ,
178178
179179 /// Strategy for expanding tabs for columns beyond those specified
180180 /// in `tabstops`.
@@ -189,7 +189,7 @@ impl Options {
189189 } ;
190190
191191 let iflag = matches. get_flag ( options:: INITIAL ) ;
192- let uflag = !matches. get_flag ( options:: NO_UTF8 ) ;
192+ let utf8 = !matches. get_flag ( options:: NO_UTF8 ) ;
193193
194194 // avoid allocations when dumping out long sequences of spaces
195195 // by precomputing the longest string of spaces we will ever need
@@ -214,7 +214,7 @@ impl Options {
214214 tabstops,
215215 tspaces,
216216 iflag,
217- uflag ,
217+ utf8 ,
218218 remaining_mode,
219219 } )
220220 }
@@ -349,7 +349,62 @@ enum CharType {
349349 Other ,
350350}
351351
352- #[ allow( clippy:: cognitive_complexity) ]
352+ /// Classify a character and determine its width and byte length.
353+ ///
354+ /// Returns `(CharType, display_width, byte_length)`.
355+ #[ inline]
356+ fn classify_char ( buf : & [ u8 ] , byte : usize , utf8 : bool ) -> ( CharType , usize , usize ) {
357+ use self :: CharType :: { Backspace , Other , Tab } ;
358+
359+ if utf8 {
360+ let nbytes = char:: from ( buf[ byte] ) . len_utf8 ( ) ;
361+
362+ if byte + nbytes > buf. len ( ) {
363+ // don't overrun buffer because of invalid UTF-8
364+ return ( Other , 1 , 1 ) ;
365+ }
366+
367+ if let Ok ( t) = from_utf8 ( & buf[ byte..byte + nbytes] ) {
368+ match t. chars ( ) . next ( ) {
369+ Some ( '\t' ) => ( Tab , 0 , 1 ) ,
370+ Some ( '\x08' ) => ( Backspace , 0 , 1 ) ,
371+ Some ( c) => ( Other , UnicodeWidthChar :: width ( c) . unwrap_or ( 0 ) , nbytes) ,
372+ None => {
373+ // no valid char at start of t, so take 1 byte
374+ ( Other , 1 , 1 )
375+ }
376+ }
377+ } else {
378+ ( Other , 1 , 1 ) // implicit assumption: non-UTF-8 char is 1 col wide
379+ }
380+ } else {
381+ (
382+ match buf. get ( byte) {
383+ // always take exactly 1 byte in strict ASCII mode
384+ Some ( 0x09 ) => Tab ,
385+ Some ( 0x08 ) => Backspace ,
386+ _ => Other ,
387+ } ,
388+ 0 ,
389+ 1 ,
390+ )
391+ }
392+ }
393+
/// Emit `nts` bytes of padding to `output` in place of a tab.
///
/// If `tspaces` holds at least `nts` bytes, its first `nts` bytes are written
/// directly with no allocation. Otherwise a temporary string of `nts` spaces
/// is built and written. Any error from the underlying write is returned
/// unchanged.
#[inline]
fn write_tab_spaces(
    output: &mut BufWriter<std::io::Stdout>,
    nts: usize,
    tspaces: &str,
) -> std::io::Result<()> {
    // `get(..nts)` succeeds exactly when `nts <= tspaces.len()`
    match tspaces.as_bytes().get(..nts) {
        Some(prefix) => output.write_all(prefix),
        None => {
            let padding = " ".repeat(nts);
            output.write_all(padding.as_bytes())
        }
    }
}
407+
353408fn expand_line (
354409 buf : & mut Vec < u8 > ,
355410 output : & mut BufWriter < std:: io:: Stdout > ,
@@ -360,8 +415,7 @@ fn expand_line(
360415
361416 // Fast path: if there are no tabs, backspaces, and (in UTF-8 mode or no carriage returns),
362417 // we can write the buffer directly without character-by-character processing
363- if !buf. contains ( & b'\t' ) && !buf. contains ( & b'\x08' ) && ( options. uflag || !buf. contains ( & b'\r' ) )
364- {
418+ if !buf. contains ( & b'\t' ) && !buf. contains ( & b'\x08' ) && ( options. utf8 || !buf. contains ( & b'\r' ) ) {
365419 output. write_all ( buf) ?;
366420 buf. truncate ( 0 ) ;
367421 return Ok ( ( ) ) ;
@@ -372,37 +426,7 @@ fn expand_line(
372426 let mut init = true ;
373427
374428 while byte < buf. len ( ) {
375- let ( ctype, cwidth, nbytes) = if options. uflag {
376- let nbytes = char:: from ( buf[ byte] ) . len_utf8 ( ) ;
377-
378- if byte + nbytes > buf. len ( ) {
379- // don't overrun buffer because of invalid UTF-8
380- ( Other , 1 , 1 )
381- } else if let Ok ( t) = from_utf8 ( & buf[ byte..byte + nbytes] ) {
382- match t. chars ( ) . next ( ) {
383- Some ( '\t' ) => ( Tab , 0 , nbytes) ,
384- Some ( '\x08' ) => ( Backspace , 0 , nbytes) ,
385- Some ( c) => ( Other , UnicodeWidthChar :: width ( c) . unwrap_or ( 0 ) , nbytes) ,
386- None => {
387- // no valid char at start of t, so take 1 byte
388- ( Other , 1 , 1 )
389- }
390- }
391- } else {
392- ( Other , 1 , 1 ) // implicit assumption: non-UTF-8 char is 1 col wide
393- }
394- } else {
395- (
396- match buf. get ( byte) {
397- // always take exactly 1 byte in strict ASCII mode
398- Some ( 0x09 ) => Tab ,
399- Some ( 0x08 ) => Backspace ,
400- _ => Other ,
401- } ,
402- 1 ,
403- 1 ,
404- )
405- } ;
429+ let ( ctype, cwidth, nbytes) = classify_char ( buf, byte, options. utf8 ) ;
406430
407431 // figure out how many columns this char takes up
408432 match ctype {
@@ -413,23 +437,24 @@ fn expand_line(
413437
414438 // now dump out either spaces if we're expanding, or a literal tab if we're not
415439 if init || !options. iflag {
416- if nts <= options. tspaces . len ( ) {
417- output. write_all ( & options. tspaces . as_bytes ( ) [ ..nts] ) ?;
418- } else {
419- output. write_all ( " " . repeat ( nts) . as_bytes ( ) ) ?;
420- }
440+ write_tab_spaces ( output, nts, & options. tspaces ) ?;
421441 } else {
422442 output. write_all ( & buf[ byte..byte + nbytes] ) ?;
423443 }
424444 }
425- _ => {
426- col = if ctype == Other {
427- col + cwidth
428- } else if col > 0 {
429- col - 1
430- } else {
431- 0
432- } ;
445+ Backspace => {
446+ col = col. saturating_sub ( 1 ) ;
447+
448+ // if we're writing anything other than a space, then we're
449+ // done with the line's leading spaces
450+ if buf[ byte] != 0x20 {
451+ init = false ;
452+ }
453+
454+ output. write_all ( & buf[ byte..byte + nbytes] ) ?;
455+ }
456+ Other => {
457+ col += cwidth;
433458
434459 // if we're writing anything other than a space, then we're
435460 // done with the line's leading spaces
0 commit comments