Merge pull request #8264 from sylvestre/fix-fmt

cakebaker · web-flow · commit 5ce678b1cd11 · 2025-06-30T14:45:12.000+02:00
Fix the last two GNU tests for fmt
diff --git a/.github/workflows/GnuTests.yml b/.github/workflows/GnuTests.yml
@@ -166,6 +166,7 @@ jobs:
         sudo locale-gen --keep-existing sv_SE
         sudo locale-gen --keep-existing sv_SE.UTF-8
         sudo locale-gen --keep-existing en_US
+        sudo locale-gen --keep-existing en_US.UTF-8
         sudo locale-gen --keep-existing ru_RU.KOI8-R
 
         sudo update-locale
diff --git a/src/uu/fmt/src/linebreak.rs b/src/uu/fmt/src/linebreak.rs
@@ -236,7 +236,11 @@ fn find_kp_breakpoints<'a, T: Iterator<Item = &'a WordInfo<'a>>>(
     let mut next_active_breaks = vec![];
 
     let stretch = args.opts.width - args.opts.goal;
-    let minlength = args.opts.goal.max(stretch + 1) - stretch;
+    let minlength = if args.opts.goal <= 10 {
+        1
+    } else {
+        args.opts.goal.max(stretch + 1) - stretch
+    };
     let mut new_linebreaks = vec![];
     let mut is_sentence_start = false;
     let mut least_demerits = 0;
@@ -384,11 +388,11 @@ fn build_best_path<'a>(paths: &[LineBreak<'a>], active: &[usize]) -> Vec<(&'a Wo
 const BAD_INFTY: i64 = 10_000_000;
 const BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY;
 // badness = BAD_MULT * abs(r) ^ 3
-const BAD_MULT: f32 = 100.0;
+const BAD_MULT: f32 = 200.0;
 // DR_MULT is multiplier for delta-R between lines
 const DR_MULT: f32 = 600.0;
 // DL_MULT is penalty multiplier for short words at end of line
-const DL_MULT: f32 = 300.0;
+const DL_MULT: f32 = 10.0;
 
 fn compute_demerits(delta_len: isize, stretch: usize, wlen: usize, prev_rat: f32) -> (i64, f32) {
     // how much stretch are we using?
diff --git a/src/uu/fmt/src/parasplit.rs b/src/uu/fmt/src/parasplit.rs
@@ -26,6 +26,14 @@ fn char_width(c: char) -> usize {
     }
 }
 
+// GNU fmt has a more restrictive definition of whitespace than Unicode.
+// It only considers ASCII whitespace characters (space, tab, newline, etc.)
+// and excludes many Unicode whitespace characters like non-breaking spaces.
+fn is_fmt_whitespace(c: char) -> bool {
+    // Only ASCII whitespace characters are considered whitespace in GNU fmt
+    matches!(c, ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C')
+}
+
 // lines with PSKIP, lacking PREFIX, or which are entirely blank are
 // NoFormatLines; otherwise, they are FormatLines
 #[derive(Debug)]
@@ -109,7 +117,7 @@ impl FileLines<'_> {
             for (i, char) in line.char_indices() {
                 if line[i..].starts_with(pfx) {
                     return (true, i);
-                } else if !char.is_whitespace() {
+                } else if !is_fmt_whitespace(char) {
                     break;
                 }
             }
@@ -128,7 +136,7 @@ impl FileLines<'_> {
                 prefix_len = indent_len;
             }
 
-            if (os >= prefix_end) && !c.is_whitespace() {
+            if (os >= prefix_end) && !is_fmt_whitespace(c) {
                 // found first non-whitespace after prefix, this is indent_end
                 indent_end = os;
                 break;
@@ -154,7 +162,7 @@ impl Iterator for FileLines<'_> {
         // emit a blank line
         // Err(true) indicates that this was a linebreak,
         // which is important to know when detecting mail headers
-        if n.chars().all(char::is_whitespace) {
+        if n.chars().all(is_fmt_whitespace) {
             return Some(Line::NoFormatLine(String::new(), true));
         }
 
@@ -174,7 +182,7 @@ impl Iterator for FileLines<'_> {
         if pmatch
             && n[poffset + self.opts.prefix.as_ref().map_or(0, |s| s.len())..]
                 .chars()
-                .all(char::is_whitespace)
+                .all(is_fmt_whitespace)
         {
             return Some(Line::NoFormatLine(n, false));
         }
@@ -498,7 +506,7 @@ impl WordSplit<'_> {
         let mut aftertab = 0;
         let mut word_start = None;
         for (os, c) in string.char_indices() {
-            if !c.is_whitespace() {
+            if !is_fmt_whitespace(c) {
                 word_start = Some(os);
                 break;
             } else if c == '\t' {
@@ -519,7 +527,7 @@ impl WordSplit<'_> {
 impl WordSplit<'_> {
     fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> {
         // wordsplits *must* start at a non-whitespace character
-        let trim_string = string.trim_start();
+        let trim_string = string.trim_start_matches(is_fmt_whitespace);
         WordSplit {
             opts,
             string: trim_string,
@@ -571,7 +579,7 @@ impl<'a> Iterator for WordSplit<'a> {
         // points to whitespace character OR end of string
         let mut word_nchars = 0;
         self.position = match self.string[word_start..].find(|x: char| {
-            if x.is_whitespace() {
+            if is_fmt_whitespace(x) {
                 true
             } else {
                 word_nchars += char_width(x);
diff --git a/tests/by-util/test_fmt.rs b/tests/by-util/test_fmt.rs
@@ -2,6 +2,9 @@
 //
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
+
+// spell-checker:ignore plass samp
+
 use uutests::new_ucmd;
 use uutests::util::TestScenario;
 use uutests::util_name;
@@ -303,3 +306,73 @@ fn prefix_equal_skip_prefix_equal_two() {
             .stdout_is_fixture("prefixed-one-word-per-line_p=_P=2.txt");
     }
 }
+
+#[test]
+fn test_fmt_unicode_whitespace_handling() {
+    // Character classification fix: Test that Unicode whitespace characters like non-breaking space
+    // are NOT treated as whitespace by fmt, maintaining GNU fmt compatibility.
+    // GNU fmt only recognizes ASCII whitespace (space, tab, newline, etc.) and excludes
+    // Unicode whitespace characters to ensure consistent formatting behavior.
+    // This prevents regression of the character classification fix
+    let non_breaking_space = "\u{00A0}"; // U+00A0 NO-BREAK SPACE
+    let figure_space = "\u{2007}"; // U+2007 FIGURE SPACE
+    let narrow_no_break_space = "\u{202F}"; // U+202F NARROW NO-BREAK SPACE
+
+    // When fmt splits on width=1, these characters should NOT cause line breaks
+    // because they should not be considered whitespace
+    for (name, char) in [
+        ("non-breaking space", non_breaking_space),
+        ("figure space", figure_space),
+        ("narrow no-break space", narrow_no_break_space),
+    ] {
+        let input = format!("={char}=");
+        let result = new_ucmd!()
+            .args(&["-s", "-w1"])
+            .pipe_in(input.as_bytes())
+            .succeeds();
+
+        // Should be 1 line since the Unicode char is not treated as whitespace
+        assert_eq!(
+            result.stdout_str().lines().count(),
+            1,
+            "Failed for {name}: Unicode character should not be treated as whitespace"
+        );
+    }
+}
+
+#[test]
+fn test_fmt_knuth_plass_line_breaking() {
+    // Line breaking algorithm improvements: Test the enhanced Knuth-Plass optimal line breaking
+    // algorithm that better handles sentence boundaries, word positioning constraints,
+    // and produces more natural line breaks for complex text formatting.
+    // This prevents regression of the line breaking algorithm improvements
+    let input = "@command{fmt} prefers breaking lines at the end of a sentence, and tries to\n\
+                avoid line breaks after the first word of a sentence or before the last word\n\
+                of a sentence.  A @dfn{sentence break} is defined as either the end of a\n\
+                paragraph or a word ending in any of @samp{.?!}, followed by two spaces or end\n\
+                of line, ignoring any intervening parentheses or quotes.  Like @TeX{},\n\
+                @command{fmt} reads entire ''paragraphs'' before choosing line breaks; the\n\
+                algorithm is a variant of that given by\n\
+                Donald E. Knuth and Michael F. Plass\n\
+                in ''Breaking Paragraphs Into Lines'',\n\
+                @cite{Software---Practice & Experience}\n\
+                @b{11}, 11 (November 1981), 1119--1184.";
+
+    let expected = "@command{fmt} prefers breaking lines at the end of a sentence,\n\
+                   and tries to avoid line breaks after the first word of a sentence\n\
+                   or before the last word of a sentence.  A @dfn{sentence break}\n\
+                   is defined as either the end of a paragraph or a word ending\n\
+                   in any of @samp{.?!}, followed by two spaces or end of line,\n\
+                   ignoring any intervening parentheses or quotes.  Like @TeX{},\n\
+                   @command{fmt} reads entire ''paragraphs'' before choosing line\n\
+                   breaks; the algorithm is a variant of that given by Donald\n\
+                   E. Knuth and Michael F. Plass in ''Breaking Paragraphs Into\n\
+                   Lines'', @cite{Software---Practice & Experience} @b{11}, 11\n\
+                   (November 1981), 1119--1184.\n";
+
+    new_ucmd!()
+        .args(&["-g", "60", "-w", "72"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_is(expected);
+}