Skip to content

Commit 5ce678b

Browse files
authored
Merge pull request #8264 from sylvestre/fix-fmt
Fix the last two GNU tests for fmt
2 parents 0e69c01 + cc296d8 commit 5ce678b

4 files changed

Lines changed: 96 additions & 10 deletions

File tree

.github/workflows/GnuTests.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ jobs:
166166
sudo locale-gen --keep-existing sv_SE
167167
sudo locale-gen --keep-existing sv_SE.UTF-8
168168
sudo locale-gen --keep-existing en_US
169+
sudo locale-gen --keep-existing en_US.UTF-8
169170
sudo locale-gen --keep-existing ru_RU.KOI8-R
170171
171172
sudo update-locale

src/uu/fmt/src/linebreak.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,11 @@ fn find_kp_breakpoints<'a, T: Iterator<Item = &'a WordInfo<'a>>>(
236236
let mut next_active_breaks = vec![];
237237

238238
let stretch = args.opts.width - args.opts.goal;
239-
let minlength = args.opts.goal.max(stretch + 1) - stretch;
239+
let minlength = if args.opts.goal <= 10 {
240+
1
241+
} else {
242+
args.opts.goal.max(stretch + 1) - stretch
243+
};
240244
let mut new_linebreaks = vec![];
241245
let mut is_sentence_start = false;
242246
let mut least_demerits = 0;
@@ -384,11 +388,11 @@ fn build_best_path<'a>(paths: &[LineBreak<'a>], active: &[usize]) -> Vec<(&'a Wo
384388
const BAD_INFTY: i64 = 10_000_000;
385389
const BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY;
386390
// badness = BAD_MULT * abs(r) ^ 3
387-
const BAD_MULT: f32 = 100.0;
391+
const BAD_MULT: f32 = 200.0;
388392
// DR_MULT is multiplier for delta-R between lines
389393
const DR_MULT: f32 = 600.0;
390394
// DL_MULT is penalty multiplier for short words at end of line
391-
const DL_MULT: f32 = 300.0;
395+
const DL_MULT: f32 = 10.0;
392396

393397
fn compute_demerits(delta_len: isize, stretch: usize, wlen: usize, prev_rat: f32) -> (i64, f32) {
394398
// how much stretch are we using?

src/uu/fmt/src/parasplit.rs

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,14 @@ fn char_width(c: char) -> usize {
2626
}
2727
}
2828

29+
// Returns true if `c` counts as whitespace for fmt, which follows GNU fmt rather than Unicode.
30+
// Only six ASCII characters qualify: space, tab, newline, carriage return, vertical tab
31+
// and form feed. Unicode whitespace such as the non-breaking space is excluded.
32+
fn is_fmt_whitespace(c: char) -> bool {
33+
// '\x0B' is vertical tab and '\x0C' is form feed; everything not listed is non-whitespace
34+
matches!(c, ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C')
35+
}
36+
2937
// lines with PSKIP, lacking PREFIX, or which are entirely blank are
3038
// NoFormatLines; otherwise, they are FormatLines
3139
#[derive(Debug)]
@@ -109,7 +117,7 @@ impl FileLines<'_> {
109117
for (i, char) in line.char_indices() {
110118
if line[i..].starts_with(pfx) {
111119
return (true, i);
112-
} else if !char.is_whitespace() {
120+
} else if !is_fmt_whitespace(char) {
113121
break;
114122
}
115123
}
@@ -128,7 +136,7 @@ impl FileLines<'_> {
128136
prefix_len = indent_len;
129137
}
130138

131-
if (os >= prefix_end) && !c.is_whitespace() {
139+
if (os >= prefix_end) && !is_fmt_whitespace(c) {
132140
// found first non-whitespace after prefix, this is indent_end
133141
indent_end = os;
134142
break;
@@ -154,7 +162,7 @@ impl Iterator for FileLines<'_> {
154162
// emit a blank line
155163
// Err(true) indicates that this was a linebreak,
156164
// which is important to know when detecting mail headers
157-
if n.chars().all(char::is_whitespace) {
165+
if n.chars().all(is_fmt_whitespace) {
158166
return Some(Line::NoFormatLine(String::new(), true));
159167
}
160168

@@ -174,7 +182,7 @@ impl Iterator for FileLines<'_> {
174182
if pmatch
175183
&& n[poffset + self.opts.prefix.as_ref().map_or(0, |s| s.len())..]
176184
.chars()
177-
.all(char::is_whitespace)
185+
.all(is_fmt_whitespace)
178186
{
179187
return Some(Line::NoFormatLine(n, false));
180188
}
@@ -498,7 +506,7 @@ impl WordSplit<'_> {
498506
let mut aftertab = 0;
499507
let mut word_start = None;
500508
for (os, c) in string.char_indices() {
501-
if !c.is_whitespace() {
509+
if !is_fmt_whitespace(c) {
502510
word_start = Some(os);
503511
break;
504512
} else if c == '\t' {
@@ -519,7 +527,7 @@ impl WordSplit<'_> {
519527
impl WordSplit<'_> {
520528
fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> {
521529
// wordsplits *must* start at a non-whitespace character
522-
let trim_string = string.trim_start();
530+
let trim_string = string.trim_start_matches(is_fmt_whitespace);
523531
WordSplit {
524532
opts,
525533
string: trim_string,
@@ -571,7 +579,7 @@ impl<'a> Iterator for WordSplit<'a> {
571579
// points to whitespace character OR end of string
572580
let mut word_nchars = 0;
573581
self.position = match self.string[word_start..].find(|x: char| {
574-
if x.is_whitespace() {
582+
if is_fmt_whitespace(x) {
575583
true
576584
} else {
577585
word_nchars += char_width(x);

tests/by-util/test_fmt.rs

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
//
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
5+
6+
// spell-checker:ignore plass samp
7+
58
use uutests::new_ucmd;
69
use uutests::util::TestScenario;
710
use uutests::util_name;
@@ -303,3 +306,73 @@ fn prefix_equal_skip_prefix_equal_two() {
303306
.stdout_is_fixture("prefixed-one-word-per-line_p=_P=2.txt");
304307
}
305308
}
309+
310+
#[test]
311+
fn test_fmt_unicode_whitespace_handling() {
312+
// Regression test for whitespace classification: Unicode space characters such as the
313+
// non-breaking space must NOT be treated as whitespace by fmt, for GNU fmt compatibility.
314+
// GNU fmt only recognizes ASCII whitespace (space, tab, newline, etc.), so a Unicode
315+
// space placed between two other characters must not introduce a line break.
316+
// Each case below pipes `=<char>=` into fmt with `-s -w1` and counts the output lines.
317+
let non_breaking_space = "\u{00A0}"; // U+00A0 NO-BREAK SPACE
318+
let figure_space = "\u{2007}"; // U+2007 FIGURE SPACE
319+
let narrow_no_break_space = "\u{202F}"; // U+202F NARROW NO-BREAK SPACE
320+
321+
// Even with a requested width of 1, these characters must NOT cause line breaks,
322+
// because they are not considered whitespace.
323+
for (name, char) in [
324+
("non-breaking space", non_breaking_space),
325+
("figure space", figure_space),
326+
("narrow no-break space", narrow_no_break_space),
327+
] {
328+
let input = format!("={char}=");
329+
let result = new_ucmd!()
330+
.args(&["-s", "-w1"])
331+
.pipe_in(input.as_bytes())
332+
.succeeds();
333+
334+
// Exactly 1 output line is expected since the Unicode char is not treated as whitespace
335+
assert_eq!(
336+
result.stdout_str().lines().count(),
337+
1,
338+
"Failed for {name}: Unicode character should not be treated as whitespace"
339+
);
340+
}
341+
}
342+
343+
#[test]
344+
fn test_fmt_knuth_plass_line_breaking() {
345+
// Regression test for the Knuth-Plass line breaking changes: reformats a paragraph of
346+
// several sentences with `-g 60 -w 72` (goal 60, width 72) and requires the output to
347+
// match `expected` exactly, including where the breaks fall around sentence ends.
348+
// Any change to the chosen breakpoints for this input makes the test fail.
349+
let input = "@command{fmt} prefers breaking lines at the end of a sentence, and tries to\n\
350+
avoid line breaks after the first word of a sentence or before the last word\n\
351+
of a sentence. A @dfn{sentence break} is defined as either the end of a\n\
352+
paragraph or a word ending in any of @samp{.?!}, followed by two spaces or end\n\
353+
of line, ignoring any intervening parentheses or quotes. Like @TeX{},\n\
354+
@command{fmt} reads entire ''paragraphs'' before choosing line breaks; the\n\
355+
algorithm is a variant of that given by\n\
356+
Donald E. Knuth and Michael F. Plass\n\
357+
in ''Breaking Paragraphs Into Lines'',\n\
358+
@cite{Software---Practice & Experience}\n\
359+
@b{11}, 11 (November 1981), 1119--1184.";
360+
361+
let expected = "@command{fmt} prefers breaking lines at the end of a sentence,\n\
362+
and tries to avoid line breaks after the first word of a sentence\n\
363+
or before the last word of a sentence. A @dfn{sentence break}\n\
364+
is defined as either the end of a paragraph or a word ending\n\
365+
in any of @samp{.?!}, followed by two spaces or end of line,\n\
366+
ignoring any intervening parentheses or quotes. Like @TeX{},\n\
367+
@command{fmt} reads entire ''paragraphs'' before choosing line\n\
368+
breaks; the algorithm is a variant of that given by Donald\n\
369+
E. Knuth and Michael F. Plass in ''Breaking Paragraphs Into\n\
370+
Lines'', @cite{Software---Practice & Experience} @b{11}, 11\n\
371+
(November 1981), 1119--1184.\n";
372+
373+
new_ucmd!()
374+
.args(&["-g", "60", "-w", "72"])
375+
.pipe_in(input)
376+
.succeeds()
377+
.stdout_is(expected);
378+
}

0 commit comments

Comments
 (0)