Skip to content

Commit bf04096

Browse files
committed
fix(wc): respect C/POSIX locale for character counting
Modify wc -m to count bytes instead of UTF-8 characters when LC_ALL, LC_CTYPE, or LANG is set to C or POSIX. This matches GNU coreutils behavior, where MB_CUR_MAX == 1 in these locales.

Changes:
- Add is_c_or_posix_locale() helper in count_fast.rs
- Export and reuse the function in wc.rs to avoid duplication
- Update the fast path and the UTF-8 decoding path
- Add regression tests with Vietnamese text

Fixes #9712, fixes #5831.
1 parent 1e95554 commit bf04096

3 files changed

Lines changed: 201 additions & 9 deletions

File tree

src/uu/wc/src/count_fast.rs

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,30 @@
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
55

6-
// cSpell:ignore sysconf
6+
// spell-checker:ignore sysconf CTYPE
77
use crate::{wc_simd_allowed, word_count::WordCount};
88
use uucore::hardware::SimdPolicy;
99

1010
use super::WordCountable;
1111

12+
/// Check if the current locale is C or POSIX (where characters == bytes).
13+
/// This follows GNU coreutils behavior where MB_CUR_MAX == 1 in these locales.
14+
pub(crate) fn is_c_or_posix_locale() -> bool {
15+
// Check LC_ALL, LC_CTYPE, and LANG in order of precedence
16+
let locale_val = ["LC_ALL", "LC_CTYPE", "LANG"]
17+
.iter()
18+
.find_map(|&var| std::env::var(var).ok().filter(|v| !v.is_empty()));
19+
20+
if let Some(locale) = locale_val {
21+
// Extract the base locale name (before any '.' or '@')
22+
let base_locale = locale.split(&['.', '@']).next().unwrap_or(&locale);
23+
base_locale == "C" || base_locale == "POSIX"
24+
} else {
25+
// No locale set, default to POSIX behavior (chars == bytes)
26+
true
27+
}
28+
}
29+
1230
#[cfg(any(target_os = "linux", target_os = "android"))]
1331
use std::fs::OpenOptions;
1432
use std::io::{self, ErrorKind, Read};
@@ -235,6 +253,11 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
235253
let buf: &mut [u8] = &mut AlignedBuffer::default().data;
236254
let policy = SimdPolicy::detect();
237255
let simd_allowed = wc_simd_allowed(policy);
256+
257+
// In C/POSIX locale, characters are equivalent to bytes (MB_CUR_MAX == 1).
258+
// This follows GNU coreutils behavior.
259+
let chars_are_bytes = is_c_or_posix_locale();
260+
238261
loop {
239262
match handle.read(buf) {
240263
Ok(0) => return (total, None),
@@ -243,11 +266,16 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
243266
total.bytes += n;
244267
}
245268
if COUNT_CHARS {
246-
total.chars += if simd_allowed {
247-
bytecount::num_chars(&buf[..n])
269+
if chars_are_bytes {
270+
// In C/POSIX locale, count bytes instead of UTF-8 chars
271+
total.chars += n;
248272
} else {
249-
bytecount::naive_num_chars(&buf[..n])
250-
};
273+
total.chars += if simd_allowed {
274+
bytecount::num_chars(&buf[..n])
275+
} else {
276+
bytecount::naive_num_chars(&buf[..n])
277+
};
278+
}
251279
}
252280
if COUNT_LINES {
253281
total.lines += if simd_allowed {

src/uu/wc/src/wc.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
55

6-
// cSpell:ignore ilog wc wc's
6+
// spell-checker:ignore ilog wc wc's
77

88
mod count_fast;
99
mod countable;
@@ -37,7 +37,7 @@ use uucore::{
3737
};
3838

3939
use crate::{
40-
count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast},
40+
count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast, is_c_or_posix_locale},
4141
countable::WordCountable,
4242
word_count::WordCount,
4343
};
@@ -580,6 +580,7 @@ fn process_chunk<
580580
current_len: &mut usize,
581581
in_word: &mut bool,
582582
posixly_correct: bool,
583+
chars_are_bytes: bool,
583584
) {
584585
for ch in text.chars() {
585586
if SHOW_WORDS {
@@ -615,12 +616,17 @@ fn process_chunk<
615616
if SHOW_LINES && ch == '\n' {
616617
total.lines += 1;
617618
}
618-
if SHOW_CHARS {
619+
if SHOW_CHARS && !chars_are_bytes {
619620
total.chars += 1;
620621
}
621622
}
622623
total.bytes += text.len();
623624

625+
// In C/POSIX locale, chars count equals bytes count
626+
if SHOW_CHARS && chars_are_bytes {
627+
total.chars += text.len();
628+
}
629+
624630
total.max_line_length = max(*current_len, total.max_line_length);
625631
}
626632

@@ -656,6 +662,7 @@ fn word_count_from_reader_specialized<
656662
let mut in_word = false;
657663
let mut current_len = 0;
658664
let posixly_correct = env::var_os("POSIXLY_CORRECT").is_some();
665+
let chars_are_bytes = SHOW_CHARS && is_c_or_posix_locale();
659666
while let Some(chunk) = reader.next_strict() {
660667
match chunk {
661668
Ok(text) => {
@@ -665,6 +672,7 @@ fn word_count_from_reader_specialized<
665672
&mut current_len,
666673
&mut in_word,
667674
posixly_correct,
675+
chars_are_bytes,
668676
);
669677
}
670678
Err(e) => {

tests/by-util/test_wc.rs

Lines changed: 157 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ use uutests::at_and_ucmd;
88
use uutests::new_ucmd;
99
use uutests::util::vec_of_size;
1010

11-
// spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir
11+
// spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir CTYPE
12+
// spell-checker:ignore (Vietnamese) Tiếng Việt chào
1213
#[test]
1314
fn test_invalid_arg() {
1415
new_ucmd!().arg("--definitely-invalid").fails_with_code(1);
@@ -61,8 +62,10 @@ fn test_stdin_explicit() {
6162

6263
#[test]
6364
fn test_utf8() {
65+
// Requires UTF-8 locale for character counting
6466
new_ucmd!()
6567
.args(&["-lwmcL"])
68+
.env("LC_ALL", "en_US.UTF-8")
6669
.pipe_in_fixture("UTF_8_test.txt")
6770
.succeeds()
6871
.stdout_is(" 303 2178 22457 23025 79\n");
@@ -88,35 +91,43 @@ fn test_utf8_line_length_words() {
8891

8992
#[test]
9093
fn test_utf8_line_length_chars() {
94+
// Requires UTF-8 locale for character counting
9195
new_ucmd!()
9296
.arg("-Lm")
97+
.env("LC_ALL", "en_US.UTF-8")
9398
.pipe_in_fixture("UTF_8_weirdchars.txt")
9499
.succeeds()
95100
.stdout_is(" 442 48\n");
96101
}
97102

98103
#[test]
99104
fn test_utf8_line_length_chars_words() {
105+
// Requires UTF-8 locale for character counting
100106
new_ucmd!()
101107
.arg("-Lmw")
108+
.env("LC_ALL", "en_US.UTF-8")
102109
.pipe_in_fixture("UTF_8_weirdchars.txt")
103110
.succeeds()
104111
.stdout_is(" 89 442 48\n");
105112
}
106113

107114
#[test]
108115
fn test_utf8_chars() {
116+
// Requires UTF-8 locale for character counting
109117
new_ucmd!()
110118
.arg("-m")
119+
.env("LC_ALL", "en_US.UTF-8")
111120
.pipe_in_fixture("UTF_8_weirdchars.txt")
112121
.succeeds()
113122
.stdout_is("442\n");
114123
}
115124

116125
#[test]
117126
fn test_utf8_bytes_chars() {
127+
// Requires UTF-8 locale for character counting
118128
new_ucmd!()
119129
.arg("-cm")
130+
.env("LC_ALL", "en_US.UTF-8")
120131
.pipe_in_fixture("UTF_8_weirdchars.txt")
121132
.succeeds()
122133
.stdout_is(" 442 513\n");
@@ -133,17 +144,21 @@ fn test_utf8_bytes_lines() {
133144

134145
#[test]
135146
fn test_utf8_bytes_chars_lines() {
147+
// Requires UTF-8 locale for character counting
136148
new_ucmd!()
137149
.arg("-cml")
150+
.env("LC_ALL", "en_US.UTF-8")
138151
.pipe_in_fixture("UTF_8_weirdchars.txt")
139152
.succeeds()
140153
.stdout_is(" 25 442 513\n");
141154
}
142155

143156
#[test]
144157
fn test_utf8_chars_words() {
158+
// Requires UTF-8 locale for character counting
145159
new_ucmd!()
146160
.arg("-mw")
161+
.env("LC_ALL", "en_US.UTF-8")
147162
.pipe_in_fixture("UTF_8_weirdchars.txt")
148163
.succeeds()
149164
.stdout_is(" 89 442\n");
@@ -169,35 +184,43 @@ fn test_utf8_line_length_lines_words() {
169184

170185
#[test]
171186
fn test_utf8_lines_chars() {
187+
// Requires UTF-8 locale for character counting
172188
new_ucmd!()
173189
.arg("-ml")
190+
.env("LC_ALL", "en_US.UTF-8")
174191
.pipe_in_fixture("UTF_8_weirdchars.txt")
175192
.succeeds()
176193
.stdout_is(" 25 442\n");
177194
}
178195

179196
#[test]
180197
fn test_utf8_lines_words_chars() {
198+
// Requires UTF-8 locale for character counting
181199
new_ucmd!()
182200
.arg("-mlw")
201+
.env("LC_ALL", "en_US.UTF-8")
183202
.pipe_in_fixture("UTF_8_weirdchars.txt")
184203
.succeeds()
185204
.stdout_is(" 25 89 442\n");
186205
}
187206

188207
#[test]
189208
fn test_utf8_line_length_lines_chars() {
209+
// Requires UTF-8 locale for character counting
190210
new_ucmd!()
191211
.arg("-Llm")
212+
.env("LC_ALL", "en_US.UTF-8")
192213
.pipe_in_fixture("UTF_8_weirdchars.txt")
193214
.succeeds()
194215
.stdout_is(" 25 442 48\n");
195216
}
196217

197218
#[test]
198219
fn test_utf8_all() {
220+
// Requires UTF-8 locale for character counting
199221
new_ucmd!()
200222
.arg("-lwmcL")
223+
.env("LC_ALL", "en_US.UTF-8")
201224
.pipe_in_fixture("UTF_8_weirdchars.txt")
202225
.succeeds()
203226
.stdout_is(" 25 89 442 513 48\n");
@@ -921,3 +944,136 @@ fn test_posixly_correct_whitespace() {
921944
.succeeds()
922945
.stdout_is("1\n");
923946
}
947+
948+
#[test]
949+
fn test_wc_chars_c_locale() {
950+
// In C/POSIX locale, wc -m should count bytes, not UTF-8 characters
951+
// Vietnamese "Tiếng Việt" uses diacritics ("ế" and "ệ" are 3 bytes each in UTF-8)
952+
// "Tiếng" = 5 chars, 7 bytes ("ế" is 3 bytes)
953+
let vietnamese_text = "Tiếng";
954+
955+
// With LC_ALL=C, chars should equal bytes (7)
956+
new_ucmd!()
957+
.arg("-m")
958+
.env("LC_ALL", "C")
959+
.pipe_in(vietnamese_text)
960+
.succeeds()
961+
.stdout_is("7\n");
962+
963+
// Same with LC_ALL=POSIX
964+
new_ucmd!()
965+
.arg("-m")
966+
.env("LC_ALL", "POSIX")
967+
.pipe_in(vietnamese_text)
968+
.succeeds()
969+
.stdout_is("7\n");
970+
971+
// Test combined with bytes flag - should show same count
972+
new_ucmd!()
973+
.args(&["-cm"])
974+
.env("LC_ALL", "C")
975+
.pipe_in(vietnamese_text)
976+
.succeeds()
977+
.stdout_is(" 7 7\n");
978+
}
979+
980+
#[test]
981+
fn test_wc_chars_utf8_locale() {
982+
// In UTF-8 locale, wc -m should count UTF-8 characters
983+
// Vietnamese "Tiếng" is 7 bytes in UTF-8 but 5 characters ("ế" is 3 bytes)
984+
let vietnamese_text = "Tiếng";
985+
986+
// With vi_VN.UTF-8 locale, chars should be 5 (not 7)
987+
new_ucmd!()
988+
.arg("-m")
989+
.env("LC_ALL", "vi_VN.UTF-8")
990+
.pipe_in(vietnamese_text)
991+
.succeeds()
992+
.stdout_is("5\n");
993+
994+
// Test combined with bytes flag - should show different counts
995+
// Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
996+
new_ucmd!()
997+
.args(&["-cm"])
998+
.env("LC_ALL", "vi_VN.UTF-8")
999+
.pipe_in(vietnamese_text)
1000+
.succeeds()
1001+
.stdout_is(" 5 7\n");
1002+
}
1003+
1004+
#[test]
1005+
fn test_wc_chars_default_locale() {
1006+
// When no locale is set (LC_ALL, LC_CTYPE, and LANG all empty), wc defaults to POSIX behavior (chars == bytes)
1007+
// This ensures backward compatibility
1008+
let vietnamese_text = "Tiếng";
1009+
1010+
new_ucmd!()
1011+
.arg("-m")
1012+
.env("LC_ALL", "")
1013+
.env("LC_CTYPE", "")
1014+
.env("LANG", "")
1015+
.pipe_in(vietnamese_text)
1016+
.succeeds()
1017+
.stdout_is("7\n");
1018+
}
1019+
1020+
#[test]
1021+
fn test_wc_multibyte_c_locale() {
1022+
// Issue #9712 and #5831: Test various multibyte characters in C locale
1023+
// All should be counted as bytes
1024+
1025+
// Vietnamese text with multiple diacritics: "Tiếng Việt"
1026+
// 10 chars, 14 bytes ("ế" and "ệ" are 3 bytes each)
1027+
new_ucmd!()
1028+
.args(&["-cm"])
1029+
.env("LC_ALL", "C")
1030+
.pipe_in("Tiếng Việt")
1031+
.succeeds()
1032+
.stdout_is(" 14 14\n");
1033+
1034+
// Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
1035+
new_ucmd!()
1036+
.args(&["-cm"])
1037+
.env("LC_ALL", "C")
1038+
.pipe_in("ệ")
1039+
.succeeds()
1040+
.stdout_is(" 3 3\n");
1041+
1042+
// Mixed ASCII and Vietnamese: "Xin chào" = 8 chars, 9 bytes ("à" is 2 bytes)
1043+
new_ucmd!()
1044+
.args(&["-cm"])
1045+
.env("LC_ALL", "C")
1046+
.pipe_in("Xin chào")
1047+
.succeeds()
1048+
.stdout_is(" 9 9\n");
1049+
}
1050+
1051+
#[test]
1052+
fn test_wc_multibyte_utf8_locale() {
1053+
// In UTF-8 locale, multibyte characters should be counted correctly
1054+
// Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
1055+
1056+
// Vietnamese "Tiếng Việt": 10 chars, 14 bytes ("ế" and "ệ" are 3 bytes each)
1057+
new_ucmd!()
1058+
.args(&["-cm"])
1059+
.env("LC_ALL", "vi_VN.UTF-8")
1060+
.pipe_in("Tiếng Việt")
1061+
.succeeds()
1062+
.stdout_is(" 10 14\n");
1063+
1064+
// Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
1065+
new_ucmd!()
1066+
.args(&["-cm"])
1067+
.env("LC_ALL", "vi_VN.UTF-8")
1068+
.pipe_in("ệ")
1069+
.succeeds()
1070+
.stdout_is(" 1 3\n");
1071+
1072+
// Mixed ASCII and Vietnamese "Xin chào": 8 chars, 9 bytes ("à" is 2 bytes)
1073+
new_ucmd!()
1074+
.args(&["-cm"])
1075+
.env("LC_ALL", "vi_VN.UTF-8")
1076+
.pipe_in("Xin chào")
1077+
.succeeds()
1078+
.stdout_is(" 8 9\n");
1079+
}

0 commit comments

Comments
 (0)