@@ -8,7 +8,8 @@ use uutests::at_and_ucmd;
88use uutests:: new_ucmd;
99use uutests:: util:: vec_of_size;
1010
11- // spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir
11+ // spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir CTYPE
12+ // spell-checker:ignore (Vietnamese) Tiếng Việt chào
1213#[ test]
1314fn test_invalid_arg ( ) {
1415 new_ucmd ! ( ) . arg ( "--definitely-invalid" ) . fails_with_code ( 1 ) ;
@@ -61,8 +62,10 @@ fn test_stdin_explicit() {
6162
6263#[ test]
6364fn test_utf8 ( ) {
65+ // Requires UTF-8 locale for character counting
6466 new_ucmd ! ( )
6567 . args ( & [ "-lwmcL" ] )
68+ . env ( "LC_ALL" , "en_US.UTF-8" )
6669 . pipe_in_fixture ( "UTF_8_test.txt" )
6770 . succeeds ( )
6871 . stdout_is ( " 303 2178 22457 23025 79\n " ) ;
@@ -88,35 +91,43 @@ fn test_utf8_line_length_words() {
8891
8992#[ test]
9093fn test_utf8_line_length_chars ( ) {
94+ // -m only counts characters (rather than bytes) under a UTF-8 locale, so LC_ALL is set explicitly for this command
9195 new_ucmd ! ( )
9296 . arg ( "-Lm" )
97+ . env ( "LC_ALL" , "en_US.UTF-8" )
9398 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
9499 . succeeds ( )
95100 . stdout_is ( " 442 48\n " ) ;
96101}
97102
98103#[ test]
99104fn test_utf8_line_length_chars_words ( ) {
105+ // -m only counts characters (rather than bytes) under a UTF-8 locale, so LC_ALL is set explicitly for this command
100106 new_ucmd ! ( )
101107 . arg ( "-Lmw" )
108+ . env ( "LC_ALL" , "en_US.UTF-8" )
102109 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
103110 . succeeds ( )
104111 . stdout_is ( " 89 442 48\n " ) ;
105112}
106113
107114#[ test]
108115fn test_utf8_chars ( ) {
116+ // -m only counts characters (rather than bytes) under a UTF-8 locale, so LC_ALL is set explicitly; the fixture is expected to hold 442 characters
109117 new_ucmd ! ( )
110118 . arg ( "-m" )
119+ . env ( "LC_ALL" , "en_US.UTF-8" )
111120 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
112121 . succeeds ( )
113122 . stdout_is ( "442\n " ) ;
114123}
115124
116125#[ test]
117126fn test_utf8_bytes_chars ( ) {
127+ // Requires UTF-8 locale for character counting
118128 new_ucmd ! ( )
119129 . arg ( "-cm" )
130+ . env ( "LC_ALL" , "en_US.UTF-8" )
120131 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
121132 . succeeds ( )
122133 . stdout_is ( " 442 513\n " ) ;
@@ -133,17 +144,21 @@ fn test_utf8_bytes_lines() {
133144
134145#[ test]
135146fn test_utf8_bytes_chars_lines ( ) {
147+ // -m only counts characters (rather than bytes) under a UTF-8 locale, so LC_ALL is set explicitly; otherwise the character and byte columns would not differ
136148 new_ucmd ! ( )
137149 . arg ( "-cml" )
150+ . env ( "LC_ALL" , "en_US.UTF-8" )
138151 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
139152 . succeeds ( )
140153 . stdout_is ( " 25 442 513\n " ) ;
141154}
142155
143156#[ test]
144157fn test_utf8_chars_words ( ) {
158+ // Requires UTF-8 locale for character counting
145159 new_ucmd ! ( )
146160 . arg ( "-mw" )
161+ . env ( "LC_ALL" , "en_US.UTF-8" )
147162 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
148163 . succeeds ( )
149164 . stdout_is ( " 89 442\n " ) ;
@@ -169,35 +184,43 @@ fn test_utf8_line_length_lines_words() {
169184
170185#[ test]
171186fn test_utf8_lines_chars ( ) {
187+ // -m only counts characters (rather than bytes) under a UTF-8 locale, so LC_ALL is set explicitly for this command
172188 new_ucmd ! ( )
173189 . arg ( "-ml" )
190+ . env ( "LC_ALL" , "en_US.UTF-8" )
174191 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
175192 . succeeds ( )
176193 . stdout_is ( " 25 442\n " ) ;
177194}
178195
179196#[ test]
180197fn test_utf8_lines_words_chars ( ) {
198+ // -m only counts characters (rather than bytes) under a UTF-8 locale, so LC_ALL is set explicitly for this command
181199 new_ucmd ! ( )
182200 . arg ( "-mlw" )
201+ . env ( "LC_ALL" , "en_US.UTF-8" )
183202 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
184203 . succeeds ( )
185204 . stdout_is ( " 25 89 442\n " ) ;
186205}
187206
188207#[ test]
189208fn test_utf8_line_length_lines_chars ( ) {
209+ // -m only counts characters (rather than bytes) under a UTF-8 locale, so LC_ALL is set explicitly for this command
190210 new_ucmd ! ( )
191211 . arg ( "-Llm" )
212+ . env ( "LC_ALL" , "en_US.UTF-8" )
192213 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
193214 . succeeds ( )
194215 . stdout_is ( " 25 442 48\n " ) ;
195216}
196217
197218#[ test]
198219fn test_utf8_all ( ) {
220+ // Requires UTF-8 locale for character counting
199221 new_ucmd ! ( )
200222 . arg ( "-lwmcL" )
223+ . env ( "LC_ALL" , "en_US.UTF-8" )
201224 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
202225 . succeeds ( )
203226 . stdout_is ( " 25 89 442 513 48\n " ) ;
@@ -958,3 +981,136 @@ fn test_posixly_correct_whitespace() {
958981 . succeeds ( )
959982 . stdout_is ( "1\n " ) ;
960983}
984+
985+ #[ test]
986+ fn test_wc_chars_c_locale ( ) {
987+ // In C/POSIX locale, wc -m should count bytes, not UTF-8 characters
988+ // Vietnamese "Tiếng Việt" uses letters with diacritics that are multibyte in UTF-8
989+ // "Tiếng" = 5 chars, 7 bytes (4 ASCII bytes + "ế", which is 3 bytes: e1 ba bf)
990+ let vietnamese_text = "Tiếng" ;
991+
992+ // With LC_ALL=C, chars should equal bytes (7)
993+ new_ucmd ! ( )
994+ . arg ( "-m" )
995+ . env ( "LC_ALL" , "C" )
996+ . pipe_in ( vietnamese_text)
997+ . succeeds ( )
998+ . stdout_is ( "7\n " ) ;
999+
1000+ // Same with LC_ALL=POSIX
1001+ new_ucmd ! ( )
1002+ . arg ( "-m" )
1003+ . env ( "LC_ALL" , "POSIX" )
1004+ . pipe_in ( vietnamese_text)
1005+ . succeeds ( )
1006+ . stdout_is ( "7\n " ) ;
1007+
1008+ // Combined with the bytes flag: both the chars and bytes columns should show 7
1009+ new_ucmd ! ( )
1010+ . args ( & [ "-cm" ] )
1011+ . env ( "LC_ALL" , "C" )
1012+ . pipe_in ( vietnamese_text)
1013+ . succeeds ( )
1014+ . stdout_is ( " 7 7\n " ) ;
1015+ }
1016+
1017+ #[ test]
1018+ fn test_wc_chars_utf8_locale ( ) {
1019+ // In UTF-8 locale, wc -m should count UTF-8 characters
1020+ // Vietnamese "Tiếng" is 7 bytes in UTF-8 but 5 characters ("ế" is 3 bytes: e1 ba bf)
1021+ let vietnamese_text = "Tiếng" ;
1022+
1023+ // With vi_VN.UTF-8 locale, chars should be 5 (not 7)
1024+ new_ucmd ! ( )
1025+ . arg ( "-m" )
1026+ . env ( "LC_ALL" , "vi_VN.UTF-8" )
1027+ . pipe_in ( vietnamese_text)
1028+ . succeeds ( )
1029+ . stdout_is ( "5\n " ) ;
1030+
1031+ // Combined with the bytes flag: the two columns should differ (5 chars, 7 bytes)
1032+ // Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
1033+ new_ucmd ! ( )
1034+ . args ( & [ "-cm" ] )
1035+ . env ( "LC_ALL" , "vi_VN.UTF-8" )
1036+ . pipe_in ( vietnamese_text)
1037+ . succeeds ( )
1038+ . stdout_is ( " 5 7\n " ) ;
1039+ }
1040+
1041+ #[ test]
1042+ fn test_wc_chars_default_locale ( ) {
1043+ // When LC_ALL, LC_CTYPE and LANG are all set to the empty string, wc -m is expected to behave as in POSIX (chars == bytes)
1044+ // This ensures backward compatibility; "Tiếng" is 7 bytes in UTF-8, so the expected count is 7
1045+ let vietnamese_text = "Tiếng" ;
1046+
1047+ new_ucmd ! ( )
1048+ . arg ( "-m" )
1049+ . env ( "LC_ALL" , "" )
1050+ . env ( "LC_CTYPE" , "" )
1051+ . env ( "LANG" , "" )
1052+ . pipe_in ( vietnamese_text)
1053+ . succeeds ( )
1054+ . stdout_is ( "7\n " ) ;
1055+ }
1056+
1057+ #[ test]
1058+ fn test_wc_multibyte_c_locale ( ) {
1059+ // Issue #9712 and #5831: Test various multibyte characters in C locale
1060+ // All should be counted as bytes, so both columns of -cm show the byte count
1061+
1062+ // Vietnamese text with multiple diacritics: "Tiếng Việt"
1063+ // 10 chars, 14 bytes (8 ASCII bytes + "ế" and "ệ" at 3 bytes each)
1064+ new_ucmd ! ( )
1065+ . args ( & [ "-cm" ] )
1066+ . env ( "LC_ALL" , "C" )
1067+ . pipe_in ( "Tiếng Việt" )
1068+ . succeeds ( )
1069+ . stdout_is ( " 14 14\n " ) ;
1070+
1071+ // Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
1072+ new_ucmd ! ( )
1073+ . args ( & [ "-cm" ] )
1074+ . env ( "LC_ALL" , "C" )
1075+ . pipe_in ( "ệ" )
1076+ . succeeds ( )
1077+ . stdout_is ( " 3 3\n " ) ;
1078+
1079+ // Mixed ASCII and Vietnamese: "Xin chào" = 8 chars, 9 bytes ("à" is 2 bytes)
1080+ new_ucmd ! ( )
1081+ . args ( & [ "-cm" ] )
1082+ . env ( "LC_ALL" , "C" )
1083+ . pipe_in ( "Xin chào" )
1084+ . succeeds ( )
1085+ . stdout_is ( " 9 9\n " ) ;
1086+ }
1087+
1088+ #[ test]
1089+ fn test_wc_multibyte_utf8_locale ( ) {
1090+ // In UTF-8 locale, multibyte characters should be counted correctly
1091+ // Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
1092+
1093+ // Vietnamese "Tiếng Việt": 10 chars, 14 bytes (8 ASCII bytes + "ế" and "ệ" at 3 bytes each)
1094+ new_ucmd ! ( )
1095+ . args ( & [ "-cm" ] )
1096+ . env ( "LC_ALL" , "vi_VN.UTF-8" )
1097+ . pipe_in ( "Tiếng Việt" )
1098+ . succeeds ( )
1099+ . stdout_is ( " 10 14\n " ) ;
1100+
1101+ // Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
1102+ new_ucmd ! ( )
1103+ . args ( & [ "-cm" ] )
1104+ . env ( "LC_ALL" , "vi_VN.UTF-8" )
1105+ . pipe_in ( "ệ" )
1106+ . succeeds ( )
1107+ . stdout_is ( " 1 3\n " ) ;
1108+
1109+ // Mixed ASCII and Vietnamese "Xin chào": 8 chars, 9 bytes ("à" is 2 bytes)
1110+ new_ucmd ! ( )
1111+ . args ( & [ "-cm" ] )
1112+ . env ( "LC_ALL" , "vi_VN.UTF-8" )
1113+ . pipe_in ( "Xin chào" )
1114+ . succeeds ( )
1115+ . stdout_is ( " 8 9\n " ) ;
1116+ }
0 commit comments