Skip to content

Commit 94376da

Browse files
committed
refactor(wc): deduplicate C locale detection
1 parent 3eb470f commit 94376da

6 files changed

Lines changed: 31 additions & 35 deletions

File tree

src/uu/uniq/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ doctest = false
2121

2222
[dependencies]
2323
clap = { workspace = true }
24-
uucore = { workspace = true, features = ["parser"] }
2524
fluent = { workspace = true }
25+
uucore = { workspace = true, features = ["i18n-charmap", "parser"] }
2626

2727
[dev-dependencies]
2828
divan = { workspace = true }

src/uu/uniq/src/uniq.rs

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use std::num::IntErrorKind;
1414
use uucore::display::Quotable;
1515
use uucore::error::{FromIo, UError, UResult, USimpleError};
1616
use uucore::format_usage;
17+
use uucore::i18n::charmap::is_effective_ctype_c_or_posix;
1718
use uucore::parser::shortcut_value_parser::ShortcutValueParser;
1819
use uucore::posix::{OBSOLETE, posix_version};
1920
use uucore::translate;
@@ -186,14 +187,6 @@ impl Uniq {
186187
}
187188
}
188189

189-
fn is_c_locale() -> bool {
190-
["LC_ALL", "LC_CTYPE", "LANG"]
191-
.iter()
192-
.find_map(|&key| std::env::var_os(key))
193-
.filter(|v| !v.is_empty())
194-
.is_none_or(|v| v == "C" || v == "POSIX")
195-
}
196-
197190
fn key_end_index(&self, line: &[u8], key_start: usize) -> usize {
198191
let remainder = &line[key_start..];
199192
match self.slice_stop {
@@ -202,7 +195,7 @@ impl Uniq {
202195
if remainder.is_empty() {
203196
return key_start;
204197
}
205-
if Self::is_c_locale() {
198+
if is_effective_ctype_c_or_posix() {
206199
// for C or POSIX we count bytes
207200
key_start + remainder.len().min(limit)
208201
} else if let Ok(valid) = std::str::from_utf8(remainder) {

src/uu/wc/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ fluent = { workspace = true }
2626
thiserror = { workspace = true }
2727
uucore = { workspace = true, features = [
2828
"hardware",
29+
"i18n-charmap",
2930
"parser",
3031
"pipes",
3132
"quoting-style",

src/uu/wc/src/count_fast.rs

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,27 +6,10 @@
66
// spell-checker:ignore sysconf CTYPE
77
use crate::{wc_simd_allowed, word_count::WordCount};
88
use uucore::hardware::SimdPolicy;
9+
use uucore::i18n::charmap::is_effective_ctype_c_or_posix;
910

1011
use super::WordCountable;
1112

12-
/// Check if the current locale is C or POSIX (where characters == bytes).
13-
/// This follows GNU coreutils behavior where MB_CUR_MAX == 1 in these locales.
14-
pub(crate) fn is_c_or_posix_locale() -> bool {
15-
// Check LC_ALL, LC_CTYPE, and LANG in order of precedence
16-
let locale_val = ["LC_ALL", "LC_CTYPE", "LANG"]
17-
.iter()
18-
.find_map(|&var| std::env::var(var).ok().filter(|v| !v.is_empty()));
19-
20-
if let Some(locale) = locale_val {
21-
// Extract the base locale name (before any '.' or '@')
22-
let base_locale = locale.split(&['.', '@']).next().unwrap_or(&locale);
23-
base_locale == "C" || base_locale == "POSIX"
24-
} else {
25-
// No locale set, default to POSIX behavior (chars == bytes)
26-
true
27-
}
28-
}
29-
3013
use std::io::{self, ErrorKind, Read};
3114

3215
#[cfg(unix)]
@@ -242,7 +225,7 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
242225

243226
// In C/POSIX locale, characters are equivalent to bytes (MB_CUR_MAX == 1).
244227
// This follows GNU coreutils behavior.
245-
let chars_are_bytes = is_c_or_posix_locale();
228+
let chars_are_bytes = is_effective_ctype_c_or_posix();
246229

247230
loop {
248231
match handle.read(buf) {

src/uu/wc/src/wc.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use clap::{Arg, ArgAction, ArgMatches, Command, builder::ValueParser};
2626
use thiserror::Error;
2727
use unicode_width::UnicodeWidthChar;
2828
use utf8::{BufReadDecoder, BufReadDecoderError};
29-
use uucore::{display::Quotable, translate};
29+
use uucore::{display::Quotable, i18n::charmap::is_effective_ctype_c_or_posix, translate};
3030

3131
use uucore::{
3232
error::{FromIo, UError, UResult},
@@ -38,7 +38,7 @@ use uucore::{
3838
};
3939

4040
use crate::{
41-
count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast, is_c_or_posix_locale},
41+
count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast},
4242
countable::WordCountable,
4343
word_count::WordCount,
4444
};
@@ -663,7 +663,7 @@ fn word_count_from_reader_specialized<
663663
let mut in_word = false;
664664
let mut current_len = 0;
665665
let is_posixly_correct = *IS_POSIXLY_CORRECT;
666-
let chars_are_bytes = SHOW_CHARS && is_c_or_posix_locale();
666+
let chars_are_bytes = SHOW_CHARS && is_effective_ctype_c_or_posix();
667667
while let Some(chunk) = reader.next_strict() {
668668
match chunk {
669669
Ok(text) => {

src/uucore/src/lib/features/i18n/charmap.rs

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
1010
use std::sync::OnceLock;
1111

12+
/// Environment variables that may name the `LC_CTYPE` locale, listed in the
/// order they are consulted: the first one holding a non-empty value wins.
const CTYPE_LOCALE_VARS: [&str; 3] = ["LC_ALL", "LC_CTYPE", "LANG"];
13+
1214
enum MbEncoding {
1315
Utf8,
1416
Gb18030,
@@ -27,12 +29,29 @@ fn encoding_from_name(enc: &str) -> MbEncoding {
2729
}
2830
}
2931

32+
/// Return the effective `LC_CTYPE` locale value from the environment.
33+
///
34+
/// Empty values are ignored, matching the locale precedence used for character
35+
/// map detection.
36+
pub fn get_effective_ctype_locale() -> Option<String> {
37+
CTYPE_LOCALE_VARS
38+
.iter()
39+
.find_map(|&key| std::env::var(key).ok().filter(|v| !v.is_empty()))
40+
}
41+
42+
/// Return whether the effective `LC_CTYPE` locale is the byte-oriented C/POSIX locale.
43+
///
44+
/// A missing effective locale defaults to POSIX behavior. Only exact `C` and
45+
/// `POSIX` locale values are treated as explicit C/POSIX locales; locales such
46+
/// as `C.UTF-8` are not.
47+
pub fn is_effective_ctype_c_or_posix() -> bool {
48+
get_effective_ctype_locale().is_none_or(|locale| locale == "C" || locale == "POSIX")
49+
}
50+
3051
fn get_encoding() -> &'static MbEncoding {
3152
static ENCODING: OnceLock<MbEncoding> = OnceLock::new();
3253
ENCODING.get_or_init(|| {
33-
let val = ["LC_ALL", "LC_CTYPE", "LANG"]
34-
.iter()
35-
.find_map(|&k| std::env::var(k).ok().filter(|v| !v.is_empty()));
54+
let val = get_effective_ctype_locale();
3655
let s = match val.as_deref() {
3756
Some(s) if s != "C" && s != "POSIX" => s,
3857
_ => return MbEncoding::Utf8,

0 commit comments

Comments
 (0)