fix(wc): respect C/POSIX locale for character counting

naoNao89 · naoNao89 · commit bf040967c4be · 2026-02-18T20:02:02.000+07:00
Modify wc -m to count bytes instead of UTF-8 characters when LC_ALL, LC_CTYPE, or LANG is set to C or POSIX. This matches GNU coreutils behavior where MB_CUR_MAX == 1 in these locales. Changes: - Add is_c_or_posix_locale() helper in count_fast.rs - Export and reuse function in wc.rs to avoid duplication - Update fast path and UTF-8 decoding path - Add regression tests with Vietnamese text Fixes #9712, fixes #5831.
diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs
@@ -3,12 +3,30 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 
-// cSpell:ignore sysconf
+// spell-checker:ignore sysconf CTYPE
 use crate::{wc_simd_allowed, word_count::WordCount};
 use uucore::hardware::SimdPolicy;
 
 use super::WordCountable;
 
+/// Check if the current locale is plain C or POSIX (where characters == bytes).
+/// Follows GNU coreutils: MB_CUR_MAX == 1 there, but not in multibyte `C.UTF-8`.
+pub(crate) fn is_c_or_posix_locale() -> bool {
+    // Check LC_ALL, LC_CTYPE, and LANG in order of precedence
+    let locale_val = ["LC_ALL", "LC_CTYPE", "LANG"]
+        .iter()
+        .find_map(|&var| std::env::var(var).ok().filter(|v| !v.is_empty()));
+
+    if let Some(locale) = locale_val {
+        // Names look like `name[.codeset][@modifier]`; only name and codeset matter.
+        let stem = locale.split('@').next().unwrap_or(&locale);
+        let (name, codeset) = stem.split_once('.').unwrap_or((stem, ""));
+        let utf8 = codeset.eq_ignore_ascii_case("UTF-8") || codeset.eq_ignore_ascii_case("UTF8");
+        (name == "C" || name == "POSIX") && !utf8
+    } else {
+        true // No locale set, default to POSIX behavior (chars == bytes)
+    }
+}
+
 #[cfg(any(target_os = "linux", target_os = "android"))]
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind, Read};
@@ -235,6 +253,11 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
     let buf: &mut [u8] = &mut AlignedBuffer::default().data;
     let policy = SimdPolicy::detect();
     let simd_allowed = wc_simd_allowed(policy);
+
+    // In C/POSIX locale, characters are equivalent to bytes (MB_CUR_MAX == 1).
+    // This follows GNU coreutils behavior.
+    let chars_are_bytes = is_c_or_posix_locale();
+
     loop {
         match handle.read(buf) {
             Ok(0) => return (total, None),
@@ -243,11 +266,16 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
                     total.bytes += n;
                 }
                 if COUNT_CHARS {
-                    total.chars += if simd_allowed {
-                        bytecount::num_chars(&buf[..n])
+                    if chars_are_bytes {
+                        // In C/POSIX locale, count bytes instead of UTF-8 chars
+                        total.chars += n;
                     } else {
-                        bytecount::naive_num_chars(&buf[..n])
-                    };
+                        total.chars += if simd_allowed {
+                            bytecount::num_chars(&buf[..n])
+                        } else {
+                            bytecount::naive_num_chars(&buf[..n])
+                        };
+                    }
                 }
                 if COUNT_LINES {
                     total.lines += if simd_allowed {
diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs
@@ -3,7 +3,7 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 
-// cSpell:ignore ilog wc wc's
+// spell-checker:ignore ilog wc wc's
 
 mod count_fast;
 mod countable;
@@ -37,7 +37,7 @@ use uucore::{
 };
 
 use crate::{
-    count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast},
+    count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast, is_c_or_posix_locale},
     countable::WordCountable,
     word_count::WordCount,
 };
@@ -580,6 +580,7 @@ fn process_chunk<
     current_len: &mut usize,
     in_word: &mut bool,
     posixly_correct: bool,
+    chars_are_bytes: bool,
 ) {
     for ch in text.chars() {
         if SHOW_WORDS {
@@ -615,12 +616,17 @@ fn process_chunk<
         if SHOW_LINES && ch == '\n' {
             total.lines += 1;
         }
-        if SHOW_CHARS {
+        if SHOW_CHARS && !chars_are_bytes {
             total.chars += 1;
         }
     }
     total.bytes += text.len();
 
+    // In C/POSIX locale, chars count equals bytes count
+    if SHOW_CHARS && chars_are_bytes {
+        total.chars += text.len();
+    }
+
     total.max_line_length = max(*current_len, total.max_line_length);
 }
 
@@ -656,6 +662,7 @@ fn word_count_from_reader_specialized<
     let mut in_word = false;
     let mut current_len = 0;
     let posixly_correct = env::var_os("POSIXLY_CORRECT").is_some();
+    let chars_are_bytes = SHOW_CHARS && is_c_or_posix_locale();
     while let Some(chunk) = reader.next_strict() {
         match chunk {
             Ok(text) => {
@@ -665,6 +672,7 @@ fn word_count_from_reader_specialized<
                     &mut current_len,
                     &mut in_word,
                     posixly_correct,
+                    chars_are_bytes,
                 );
             }
             Err(e) => {
diff --git a/tests/by-util/test_wc.rs b/tests/by-util/test_wc.rs
@@ -8,7 +8,8 @@ use uutests::at_and_ucmd;
 use uutests::new_ucmd;
 use uutests::util::vec_of_size;
 
-// spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir
+// spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir CTYPE
+// spell-checker:ignore (Vietnamese) Tiếng Việt chào
 #[test]
 fn test_invalid_arg() {
     new_ucmd!().arg("--definitely-invalid").fails_with_code(1);
@@ -61,8 +62,10 @@ fn test_stdin_explicit() {
 
 #[test]
 fn test_utf8() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .args(&["-lwmcL"])
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_test.txt")
         .succeeds()
         .stdout_is("    303    2178   22457   23025      79\n");
@@ -88,35 +91,43 @@ fn test_utf8_line_length_words() {
 
 #[test]
 fn test_utf8_line_length_chars() {
+    // Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
     new_ucmd!()
         .arg("-Lm")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("    442      48\n");
 }
 
 #[test]
 fn test_utf8_line_length_chars_words() {
+    // Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
     new_ucmd!()
         .arg("-Lmw")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     89     442      48\n");
 }
 
 #[test]
 fn test_utf8_chars() {
+    // Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
     new_ucmd!()
         .arg("-m")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("442\n");
 }
 
 #[test]
 fn test_utf8_bytes_chars() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-cm")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("    442     513\n");
@@ -133,17 +144,21 @@ fn test_utf8_bytes_lines() {
 
 #[test]
 fn test_utf8_bytes_chars_lines() {
+    // Pin a UTF-8 locale so the char count (442) stays distinct from the byte count (513)
     new_ucmd!()
         .arg("-cml")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     25     442     513\n");
 }
 
 #[test]
 fn test_utf8_chars_words() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-mw")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     89     442\n");
@@ -169,35 +184,43 @@ fn test_utf8_line_length_lines_words() {
 
 #[test]
 fn test_utf8_lines_chars() {
+    // Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
     new_ucmd!()
         .arg("-ml")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     25     442\n");
 }
 
 #[test]
 fn test_utf8_lines_words_chars() {
+    // Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
     new_ucmd!()
         .arg("-mlw")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     25      89     442\n");
 }
 
 #[test]
 fn test_utf8_line_length_lines_chars() {
+    // Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
     new_ucmd!()
         .arg("-Llm")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     25     442      48\n");
 }
 
 #[test]
 fn test_utf8_all() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-lwmcL")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     25      89     442     513      48\n");
@@ -921,3 +944,136 @@ fn test_posixly_correct_whitespace() {
         .succeeds()
         .stdout_is("1\n");
 }
+
+#[test]
+fn test_wc_chars_c_locale() {
+    // In C/POSIX locale, wc -m should count bytes, not UTF-8 characters
+    // Vietnamese diacritics are multibyte in UTF-8: "ế" (U+1EBF) is 3 bytes (e1 ba bf)
+    // "Tiếng" = 5 chars, 7 bytes (4 ASCII bytes + 3 bytes for "ế")
+    let vietnamese_text = "Tiếng";
+
+    // With LC_ALL=C, chars should equal bytes (7)
+    new_ucmd!()
+        .arg("-m")
+        .env("LC_ALL", "C")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("7\n");
+
+    // Same with LC_ALL=POSIX
+    new_ucmd!()
+        .arg("-m")
+        .env("LC_ALL", "POSIX")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("7\n");
+
+    // Combined with the bytes flag, both columns should show the same count
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "C")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("      7       7\n");
+}
+
+#[test]
+fn test_wc_chars_utf8_locale() {
+    // In UTF-8 locale, wc -m should count UTF-8 characters
+    // Vietnamese "Tiếng" is 7 bytes in UTF-8 but 5 characters ("ế" is 3 bytes)
+    let vietnamese_text = "Tiếng";
+
+    // With vi_VN.UTF-8 locale, chars should be 5 (not 7)
+    new_ucmd!()
+        .arg("-m")
+        .env("LC_ALL", "vi_VN.UTF-8")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("5\n");
+
+    // Combined with the bytes flag, the two columns should differ
+    // Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "vi_VN.UTF-8")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("      5       7\n");
+}
+
+#[test]
+fn test_wc_chars_default_locale() {
+    // LC_ALL, LC_CTYPE and LANG are all set to empty strings, which counts as "no locale set";
+    // wc -m then falls back to POSIX behavior (chars == bytes), so "Tiếng" reports 7
+    let vietnamese_text = "Tiếng";
+
+    new_ucmd!()
+        .arg("-m")
+        .env("LC_ALL", "")
+        .env("LC_CTYPE", "")
+        .env("LANG", "")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("7\n");
+}
+
+#[test]
+fn test_wc_multibyte_c_locale() {
+    // Issue #9712 and #5831: Test various multibyte characters in C locale
+    // All should be counted as bytes
+
+    // Vietnamese text with multiple diacritics: "Tiếng Việt"
+    // 10 chars, 14 bytes ("ế" and "ệ" are 3 bytes each, plus 8 ASCII bytes)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "C")
+        .pipe_in("Tiếng Việt")
+        .succeeds()
+        .stdout_is("     14      14\n");
+
+    // Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "C")
+        .pipe_in("ệ")
+        .succeeds()
+        .stdout_is("      3       3\n");
+
+    // Mixed ASCII and Vietnamese: "Xin chào" = 8 chars, 9 bytes ("à" is 2 bytes)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "C")
+        .pipe_in("Xin chào")
+        .succeeds()
+        .stdout_is("      9       9\n");
+}
+
+#[test]
+fn test_wc_multibyte_utf8_locale() {
+    // In UTF-8 locale, multibyte characters should be counted correctly
+    // Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
+
+    // Vietnamese "Tiếng Việt": 10 chars, 14 bytes ("ế" and "ệ" are 3 bytes each)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "vi_VN.UTF-8")
+        .pipe_in("Tiếng Việt")
+        .succeeds()
+        .stdout_is("     10      14\n");
+
+    // Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "vi_VN.UTF-8")
+        .pipe_in("ệ")
+        .succeeds()
+        .stdout_is("      1       3\n");
+
+    // Mixed ASCII and Vietnamese "Xin chào": 8 chars, 9 bytes ("à" is 2 bytes)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "vi_VN.UTF-8")
+        .pipe_in("Xin chào")
+        .succeeds()
+        .stdout_is("      8       9\n");
+}