Skip to content

Commit e0eab6d

Browse files
committed
fix(wc): respect C/POSIX locale for character counting
Modify `wc -m` to count bytes instead of UTF-8 characters when the effective locale (the first non-empty value of LC_ALL, LC_CTYPE, or LANG) is C or POSIX. This matches GNU coreutils behavior, where MB_CUR_MAX == 1 in these locales.

Changes:
- Add is_c_or_posix_locale() helper in count_fast.rs
- Export and reuse the function in wc.rs to avoid duplication
- Update the fast path and the UTF-8 decoding path
- Add regression tests with Vietnamese text

Fixes #9712, fixes #5831.
1 parent 19c7f64 commit e0eab6d

3 files changed

Lines changed: 201 additions & 9 deletions

File tree

src/uu/wc/src/count_fast.rs

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,30 @@
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
55

6-
// cSpell:ignore sysconf
6+
// spell-checker:ignore sysconf CTYPE
77
use crate::{wc_simd_allowed, word_count::WordCount};
88
use uucore::hardware::SimdPolicy;
99

1010
use super::WordCountable;
1111

12+
/// Check whether the current locale is C or POSIX (where characters == bytes).
///
/// This follows GNU coreutils behavior, where `MB_CUR_MAX == 1` in these locales.
///
/// The effective locale is the first non-empty value among `LC_ALL`,
/// `LC_CTYPE` and `LANG`, in that order of precedence. A value that is not
/// valid Unicode is skipped like an unset one, because `std::env::var`
/// returns `Err` for it.
///
/// Returns `true` when none of the variables is set, or when the locale name
/// (the part before any `.codeset` or `@modifier`) is `C` or `POSIX` and the
/// codeset is not UTF-8. `C.UTF-8` therefore yields `false`: it is a
/// multibyte locale in which `wc -m` must count characters, not bytes.
pub(crate) fn is_c_or_posix_locale() -> bool {
    // Check LC_ALL, LC_CTYPE, and LANG in order of precedence.
    let locale = ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .find_map(|&var| std::env::var(var).ok().filter(|v| !v.is_empty()));

    let Some(locale) = locale else {
        // No locale set, default to POSIX behavior (chars == bytes).
        return true;
    };

    // Locale values have the form `name[.codeset][@modifier]`.
    let without_modifier = locale.split('@').next().unwrap_or(&locale);
    let (name, codeset) = match without_modifier.split_once('.') {
        Some((name, codeset)) => (name, Some(codeset)),
        None => (without_modifier, None),
    };

    // Both spellings are in common use: `C.UTF-8` and `C.utf8`.
    let is_utf8 = codeset
        .is_some_and(|cs| cs.eq_ignore_ascii_case("UTF-8") || cs.eq_ignore_ascii_case("UTF8"));

    matches!(name, "C" | "POSIX") && !is_utf8
}
29+
1230
use std::io::{self, ErrorKind, Read};
1331

1432
#[cfg(unix)]
@@ -221,6 +239,11 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
221239
let buf: &mut [u8] = &mut AlignedBuffer::default().data;
222240
let policy = SimdPolicy::detect();
223241
let simd_allowed = wc_simd_allowed(policy);
242+
243+
// In C/POSIX locale, characters are equivalent to bytes (MB_CUR_MAX == 1).
244+
// This follows GNU coreutils behavior.
245+
let chars_are_bytes = is_c_or_posix_locale();
246+
224247
loop {
225248
match handle.read(buf) {
226249
Ok(0) => return (total, None),
@@ -229,11 +252,16 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
229252
total.bytes += n;
230253
}
231254
if COUNT_CHARS {
232-
total.chars += if simd_allowed {
233-
bytecount::num_chars(&buf[..n])
255+
if chars_are_bytes {
256+
// In C/POSIX locale, count bytes instead of UTF-8 chars
257+
total.chars += n;
234258
} else {
235-
bytecount::naive_num_chars(&buf[..n])
236-
};
259+
total.chars += if simd_allowed {
260+
bytecount::num_chars(&buf[..n])
261+
} else {
262+
bytecount::naive_num_chars(&buf[..n])
263+
};
264+
}
237265
}
238266
if COUNT_LINES {
239267
total.lines += if simd_allowed {

src/uu/wc/src/wc.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
55

6-
// cSpell:ignore ilog wc wc's
6+
// spell-checker:ignore ilog wc wc's
77

88
mod count_fast;
99
mod countable;
@@ -38,7 +38,7 @@ use uucore::{
3838
};
3939

4040
use crate::{
41-
count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast},
41+
count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast, is_c_or_posix_locale},
4242
countable::WordCountable,
4343
word_count::WordCount,
4444
};
@@ -581,6 +581,7 @@ fn process_chunk<
581581
current_len: &mut usize,
582582
in_word: &mut bool,
583583
is_posixly_correct: bool,
584+
chars_are_bytes: bool,
584585
) {
585586
for ch in text.chars() {
586587
if SHOW_WORDS {
@@ -616,12 +617,17 @@ fn process_chunk<
616617
if SHOW_LINES && ch == '\n' {
617618
total.lines += 1;
618619
}
619-
if SHOW_CHARS {
620+
if SHOW_CHARS && !chars_are_bytes {
620621
total.chars += 1;
621622
}
622623
}
623624
total.bytes += text.len();
624625

626+
// In C/POSIX locale, chars count equals bytes count
627+
if SHOW_CHARS && chars_are_bytes {
628+
total.chars += text.len();
629+
}
630+
625631
total.max_line_length = max(*current_len, total.max_line_length);
626632
}
627633

@@ -657,6 +663,7 @@ fn word_count_from_reader_specialized<
657663
let mut in_word = false;
658664
let mut current_len = 0;
659665
let is_posixly_correct = *IS_POSIXLY_CORRECT;
666+
let chars_are_bytes = SHOW_CHARS && is_c_or_posix_locale();
660667
while let Some(chunk) = reader.next_strict() {
661668
match chunk {
662669
Ok(text) => {
@@ -666,6 +673,7 @@ fn word_count_from_reader_specialized<
666673
&mut current_len,
667674
&mut in_word,
668675
is_posixly_correct,
676+
chars_are_bytes,
669677
);
670678
}
671679
Err(e) => {

tests/by-util/test_wc.rs

Lines changed: 157 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ use uutests::at_and_ucmd;
88
use uutests::new_ucmd;
99
use uutests::util::vec_of_size;
1010

11-
// spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir
11+
// spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir CTYPE
12+
// spell-checker:ignore (Vietnamese) Tiếng Việt chào
1213
#[test]
1314
fn test_invalid_arg() {
1415
new_ucmd!().arg("--definitely-invalid").fails_with_code(1);
@@ -61,8 +62,10 @@ fn test_stdin_explicit() {
6162

6263
#[test]
6364
fn test_utf8() {
65+
// Requires UTF-8 locale for character counting
6466
new_ucmd!()
6567
.args(&["-lwmcL"])
68+
.env("LC_ALL", "en_US.UTF-8")
6669
.pipe_in_fixture("UTF_8_test.txt")
6770
.succeeds()
6871
.stdout_is(" 303 2178 22457 23025 79\n");
@@ -88,35 +91,43 @@ fn test_utf8_line_length_words() {
8891

8992
#[test]
9093
fn test_utf8_line_length_chars() {
94+
// Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
9195
new_ucmd!()
9296
.arg("-Lm")
97+
.env("LC_ALL", "en_US.UTF-8")
9398
.pipe_in_fixture("UTF_8_weirdchars.txt")
9499
.succeeds()
95100
.stdout_is(" 442 48\n");
96101
}
97102

98103
#[test]
99104
fn test_utf8_line_length_chars_words() {
105+
// Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
100106
new_ucmd!()
101107
.arg("-Lmw")
108+
.env("LC_ALL", "en_US.UTF-8")
102109
.pipe_in_fixture("UTF_8_weirdchars.txt")
103110
.succeeds()
104111
.stdout_is(" 89 442 48\n");
105112
}
106113

107114
#[test]
108115
fn test_utf8_chars() {
116+
// Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
109117
new_ucmd!()
110118
.arg("-m")
119+
.env("LC_ALL", "en_US.UTF-8")
111120
.pipe_in_fixture("UTF_8_weirdchars.txt")
112121
.succeeds()
113122
.stdout_is("442\n");
114123
}
115124

116125
#[test]
117126
fn test_utf8_bytes_chars() {
127+
// Requires UTF-8 locale for character counting
118128
new_ucmd!()
119129
.arg("-cm")
130+
.env("LC_ALL", "en_US.UTF-8")
120131
.pipe_in_fixture("UTF_8_weirdchars.txt")
121132
.succeeds()
122133
.stdout_is(" 442 513\n");
@@ -133,17 +144,21 @@ fn test_utf8_bytes_lines() {
133144

134145
#[test]
135146
fn test_utf8_bytes_chars_lines() {
147+
// Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
136148
new_ucmd!()
137149
.arg("-cml")
150+
.env("LC_ALL", "en_US.UTF-8")
138151
.pipe_in_fixture("UTF_8_weirdchars.txt")
139152
.succeeds()
140153
.stdout_is(" 25 442 513\n");
141154
}
142155

143156
#[test]
144157
fn test_utf8_chars_words() {
158+
// Requires UTF-8 locale for character counting
145159
new_ucmd!()
146160
.arg("-mw")
161+
.env("LC_ALL", "en_US.UTF-8")
147162
.pipe_in_fixture("UTF_8_weirdchars.txt")
148163
.succeeds()
149164
.stdout_is(" 89 442\n");
@@ -169,35 +184,43 @@ fn test_utf8_line_length_lines_words() {
169184

170185
#[test]
171186
fn test_utf8_lines_chars() {
187+
// Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
172188
new_ucmd!()
173189
.arg("-ml")
190+
.env("LC_ALL", "en_US.UTF-8")
174191
.pipe_in_fixture("UTF_8_weirdchars.txt")
175192
.succeeds()
176193
.stdout_is(" 25 442\n");
177194
}
178195

179196
#[test]
180197
fn test_utf8_lines_words_chars() {
198+
// Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
181199
new_ucmd!()
182200
.arg("-mlw")
201+
.env("LC_ALL", "en_US.UTF-8")
183202
.pipe_in_fixture("UTF_8_weirdchars.txt")
184203
.succeeds()
185204
.stdout_is(" 25 89 442\n");
186205
}
187206

188207
#[test]
189208
fn test_utf8_line_length_lines_chars() {
209+
// Pin a UTF-8 locale: under C/POSIX, `-m` counts bytes instead of characters
190210
new_ucmd!()
191211
.arg("-Llm")
212+
.env("LC_ALL", "en_US.UTF-8")
192213
.pipe_in_fixture("UTF_8_weirdchars.txt")
193214
.succeeds()
194215
.stdout_is(" 25 442 48\n");
195216
}
196217

197218
#[test]
198219
fn test_utf8_all() {
220+
// Requires UTF-8 locale for character counting
199221
new_ucmd!()
200222
.arg("-lwmcL")
223+
.env("LC_ALL", "en_US.UTF-8")
201224
.pipe_in_fixture("UTF_8_weirdchars.txt")
202225
.succeeds()
203226
.stdout_is(" 25 89 442 513 48\n");
@@ -958,3 +981,136 @@ fn test_posixly_correct_whitespace() {
958981
.succeeds()
959982
.stdout_is("1\n");
960983
}
984+
985+
#[test]
986+
fn test_wc_chars_c_locale() {
987+
// In C/POSIX locale, wc -m should count bytes, not UTF-8 characters
988+
// Vietnamese "Tiếng Việt" uses precomposed letters with diacritics ("ế" and "ệ" are 3 bytes each in UTF-8)
989+
// "Tiếng" = 5 chars, 7 bytes (4 ASCII bytes + "ế" = e1 ba bf, 3 bytes)
990+
let vietnamese_text = "Tiếng";
991+
992+
// With LC_ALL=C, chars should equal bytes (7)
993+
new_ucmd!()
994+
.arg("-m")
995+
.env("LC_ALL", "C")
996+
.pipe_in(vietnamese_text)
997+
.succeeds()
998+
.stdout_is("7\n");
999+
1000+
// Same with LC_ALL=POSIX
1001+
new_ucmd!()
1002+
.arg("-m")
1003+
.env("LC_ALL", "POSIX")
1004+
.pipe_in(vietnamese_text)
1005+
.succeeds()
1006+
.stdout_is("7\n");
1007+
1008+
// Test combined with bytes flag - should show same count
1009+
new_ucmd!()
1010+
.args(&["-cm"])
1011+
.env("LC_ALL", "C")
1012+
.pipe_in(vietnamese_text)
1013+
.succeeds()
1014+
.stdout_is(" 7 7\n");
1015+
}
1016+
1017+
#[test]
1018+
fn test_wc_chars_utf8_locale() {
1019+
// In UTF-8 locale, wc -m should count UTF-8 characters
1020+
// Vietnamese "Tiếng" is 7 bytes in UTF-8 but 5 characters ("ế" is 3 bytes: e1 ba bf)
1021+
let vietnamese_text = "Tiếng";
1022+
1023+
// With vi_VN.UTF-8 locale, chars should be 5 (not 7)
1024+
new_ucmd!()
1025+
.arg("-m")
1026+
.env("LC_ALL", "vi_VN.UTF-8")
1027+
.pipe_in(vietnamese_text)
1028+
.succeeds()
1029+
.stdout_is("5\n");
1030+
1031+
// Test combined with bytes flag - should show different counts
1032+
// Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
1033+
new_ucmd!()
1034+
.args(&["-cm"])
1035+
.env("LC_ALL", "vi_VN.UTF-8")
1036+
.pipe_in(vietnamese_text)
1037+
.succeeds()
1038+
.stdout_is(" 5 7\n");
1039+
}
1040+
1041+
#[test]
1042+
fn test_wc_chars_default_locale() {
1043+
// When LC_ALL, LC_CTYPE and LANG are all empty, no locale is selected and wc falls back to POSIX (chars == bytes)
1044+
// "Tiếng" is 5 chars / 7 bytes, so "7" shows bytes were counted. NOTE(review): the original comment cited backward compatibility - confirm
1045+
let vietnamese_text = "Tiếng";
1046+
1047+
new_ucmd!()
1048+
.arg("-m")
1049+
.env("LC_ALL", "")
1050+
.env("LC_CTYPE", "")
1051+
.env("LANG", "")
1052+
.pipe_in(vietnamese_text)
1053+
.succeeds()
1054+
.stdout_is("7\n");
1055+
}
1056+
1057+
#[test]
1058+
fn test_wc_multibyte_c_locale() {
1059+
// Issue #9712 and #5831: Test various multibyte characters in C locale
1060+
// All should be counted as bytes
1061+
1062+
// Vietnamese text with multiple diacritics: "Tiếng Việt"
1063+
// 10 chars, 14 bytes (8 ASCII bytes + "ế" and "ệ" at 3 bytes each)
1064+
new_ucmd!()
1065+
.args(&["-cm"])
1066+
.env("LC_ALL", "C")
1067+
.pipe_in("Tiếng Việt")
1068+
.succeeds()
1069+
.stdout_is(" 14 14\n");
1070+
1071+
// Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
1072+
new_ucmd!()
1073+
.args(&["-cm"])
1074+
.env("LC_ALL", "C")
1075+
.pipe_in("ệ")
1076+
.succeeds()
1077+
.stdout_is(" 3 3\n");
1078+
1079+
// Mixed ASCII and Vietnamese: "Xin chào" = 8 chars, 9 bytes ("à" is 2 bytes)
1080+
new_ucmd!()
1081+
.args(&["-cm"])
1082+
.env("LC_ALL", "C")
1083+
.pipe_in("Xin chào")
1084+
.succeeds()
1085+
.stdout_is(" 9 9\n");
1086+
}
1087+
1088+
#[test]
1089+
fn test_wc_multibyte_utf8_locale() {
1090+
// In UTF-8 locale, multibyte characters should be counted correctly
1091+
// Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
1092+
1093+
// Vietnamese "Tiếng Việt": 10 chars, 14 bytes (8 ASCII bytes + "ế" and "ệ" at 3 bytes each)
1094+
new_ucmd!()
1095+
.args(&["-cm"])
1096+
.env("LC_ALL", "vi_VN.UTF-8")
1097+
.pipe_in("Tiếng Việt")
1098+
.succeeds()
1099+
.stdout_is(" 10 14\n");
1100+
1101+
// Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
1102+
new_ucmd!()
1103+
.args(&["-cm"])
1104+
.env("LC_ALL", "vi_VN.UTF-8")
1105+
.pipe_in("ệ")
1106+
.succeeds()
1107+
.stdout_is(" 1 3\n");
1108+
1109+
// Mixed ASCII and Vietnamese "Xin chào": 8 chars, 9 bytes ("à" is 2 bytes)
1110+
new_ucmd!()
1111+
.args(&["-cm"])
1112+
.env("LC_ALL", "vi_VN.UTF-8")
1113+
.pipe_in("Xin chào")
1114+
.succeeds()
1115+
.stdout_is(" 8 9\n");
1116+
}

0 commit comments

Comments
 (0)