@@ -8,7 +8,8 @@ use uutests::at_and_ucmd;
88use uutests:: new_ucmd;
99use uutests:: util:: vec_of_size;
1010
11- // spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir
11+ // spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir CTYPE
12+ // spell-checker:ignore (Vietnamese) Tiếng Việt chào
1213#[ test]
1314fn test_invalid_arg ( ) {
1415 new_ucmd ! ( ) . arg ( "--definitely-invalid" ) . fails_with_code ( 1 ) ;
@@ -61,8 +62,10 @@ fn test_stdin_explicit() {
6162
6263#[ test]
6364fn test_utf8 ( ) {
65+ // Requires UTF-8 locale for character counting
6466 new_ucmd ! ( )
6567 . args ( & [ "-lwmcL" ] )
68+ . env ( "LC_ALL" , "en_US.UTF-8" )
6669 . pipe_in_fixture ( "UTF_8_test.txt" )
6770 . succeeds ( )
6871 . stdout_is ( " 303 2178 22457 23025 79\n " ) ;
@@ -88,35 +91,43 @@ fn test_utf8_line_length_words() {
8891
8992#[ test]
9093fn test_utf8_line_length_chars ( ) {
94+ // -m counts characters (rather than bytes) only in a UTF-8 locale, so LC_ALL is set to en_US.UTF-8 for this run
9195 new_ucmd ! ( )
9296 . arg ( "-Lm" )
97+ . env ( "LC_ALL" , "en_US.UTF-8" )
9398 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
9499 . succeeds ( )
95100 . stdout_is ( " 442 48\n " ) ;
96101}
97102
9899#[ test]
99104fn test_utf8_line_length_chars_words ( ) {
105+ // -m counts characters (rather than bytes) only in a UTF-8 locale, so LC_ALL is set to en_US.UTF-8 for this run
100106 new_ucmd ! ( )
101107 . arg ( "-Lmw" )
108+ . env ( "LC_ALL" , "en_US.UTF-8" )
102109 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
103110 . succeeds ( )
104111 . stdout_is ( " 89 442 48\n " ) ;
105112}
106113
107114#[ test]
108115fn test_utf8_chars ( ) {
116+ // -m counts characters (rather than bytes) only in a UTF-8 locale, so LC_ALL is set to en_US.UTF-8 for this run
109117 new_ucmd ! ( )
110118 . arg ( "-m" )
119+ . env ( "LC_ALL" , "en_US.UTF-8" )
111120 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
112121 . succeeds ( )
113122 . stdout_is ( "442\n " ) ;
114123}
115124
116125#[ test]
117126fn test_utf8_bytes_chars ( ) {
127+ // Requires UTF-8 locale for character counting
118128 new_ucmd ! ( )
119129 . arg ( "-cm" )
130+ . env ( "LC_ALL" , "en_US.UTF-8" )
120131 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
121132 . succeeds ( )
122133 . stdout_is ( " 442 513\n " ) ;
@@ -133,17 +144,21 @@ fn test_utf8_bytes_lines() {
133144
134145#[ test]
135146fn test_utf8_bytes_chars_lines ( ) {
147+ // The character count (442) differs from the byte count (513) only in a UTF-8 locale, so LC_ALL is set to en_US.UTF-8
136148 new_ucmd ! ( )
137149 . arg ( "-cml" )
150+ . env ( "LC_ALL" , "en_US.UTF-8" )
138151 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
139152 . succeeds ( )
140153 . stdout_is ( " 25 442 513\n " ) ;
141154}
142155
143156#[ test]
144157fn test_utf8_chars_words ( ) {
158+ // Requires UTF-8 locale for character counting
145159 new_ucmd ! ( )
146160 . arg ( "-mw" )
161+ . env ( "LC_ALL" , "en_US.UTF-8" )
147162 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
148163 . succeeds ( )
149164 . stdout_is ( " 89 442\n " ) ;
@@ -169,35 +184,43 @@ fn test_utf8_line_length_lines_words() {
169184
170185#[ test]
171186fn test_utf8_lines_chars ( ) {
187+ // -m counts characters (rather than bytes) only in a UTF-8 locale, so LC_ALL is set to en_US.UTF-8 for this run
172188 new_ucmd ! ( )
173189 . arg ( "-ml" )
190+ . env ( "LC_ALL" , "en_US.UTF-8" )
174191 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
175192 . succeeds ( )
176193 . stdout_is ( " 25 442\n " ) ;
177194}
178195
179196#[ test]
180197fn test_utf8_lines_words_chars ( ) {
198+ // -m counts characters (rather than bytes) only in a UTF-8 locale, so LC_ALL is set to en_US.UTF-8 for this run
181199 new_ucmd ! ( )
182200 . arg ( "-mlw" )
201+ . env ( "LC_ALL" , "en_US.UTF-8" )
183202 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
184203 . succeeds ( )
185204 . stdout_is ( " 25 89 442\n " ) ;
186205}
187206
188207#[ test]
189208fn test_utf8_line_length_lines_chars ( ) {
209+ // -m counts characters (rather than bytes) only in a UTF-8 locale, so LC_ALL is set to en_US.UTF-8 for this run
190210 new_ucmd ! ( )
191211 . arg ( "-Llm" )
212+ . env ( "LC_ALL" , "en_US.UTF-8" )
192213 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
193214 . succeeds ( )
194215 . stdout_is ( " 25 442 48\n " ) ;
195216}
196217
197218#[ test]
198219fn test_utf8_all ( ) {
220+ // Requires UTF-8 locale for character counting
199221 new_ucmd ! ( )
200222 . arg ( "-lwmcL" )
223+ . env ( "LC_ALL" , "en_US.UTF-8" )
201224 . pipe_in_fixture ( "UTF_8_weirdchars.txt" )
202225 . succeeds ( )
203226 . stdout_is ( " 25 89 442 513 48\n " ) ;
@@ -921,3 +944,136 @@ fn test_posixly_correct_whitespace() {
921944 . succeeds ( )
922945 . stdout_is ( "1\n " ) ;
923946}
947+
948+ #[ test]
949+ fn test_wc_chars_c_locale ( ) {
950+ // In the C/POSIX locale, wc -m should count bytes, not UTF-8 characters
951+ // Vietnamese "Tiếng" contains the diacritic "ế" (U+1EBF), which is 3 bytes in UTF-8 (e1 ba bf)
952+ // "Tiếng" = 5 chars, 7 bytes (4 ASCII bytes + 3 bytes for "ế")
953+ let vietnamese_text = "Tiếng" ;
954+
955+ // With LC_ALL=C, the char count should equal the byte count (7)
956+ new_ucmd ! ( )
957+ . arg ( "-m" )
958+ . env ( "LC_ALL" , "C" )
959+ . pipe_in ( vietnamese_text)
960+ . succeeds ( )
961+ . stdout_is ( "7\n " ) ;
962+
963+ // Same expectation with LC_ALL=POSIX
964+ new_ucmd ! ( )
965+ . arg ( "-m" )
966+ . env ( "LC_ALL" , "POSIX" )
967+ . pipe_in ( vietnamese_text)
968+ . succeeds ( )
969+ . stdout_is ( "7\n " ) ;
970+
971+ // Combined with the bytes flag, both columns should show the same count (7)
972+ new_ucmd ! ( )
973+ . args ( & [ "-cm" ] )
974+ . env ( "LC_ALL" , "C" )
975+ . pipe_in ( vietnamese_text)
976+ . succeeds ( )
977+ . stdout_is ( " 7 7\n " ) ;
978+ }
979+
980+ #[ test]
981+ fn test_wc_chars_utf8_locale ( ) {
982+ // In a UTF-8 locale, wc -m should count UTF-8 characters
983+ // Vietnamese "Tiếng" is 7 bytes in UTF-8 but 5 characters ("ế", U+1EBF, is 3 bytes: e1 ba bf)
984+ let vietnamese_text = "Tiếng" ;
985+
986+ // With the vi_VN.UTF-8 locale, the char count should be 5 (not 7)
987+ new_ucmd ! ( )
988+ . arg ( "-m" )
989+ . env ( "LC_ALL" , "vi_VN.UTF-8" )
990+ . pipe_in ( vietnamese_text)
991+ . succeeds ( )
992+ . stdout_is ( "5\n " ) ;
993+
994+ // Combined with the bytes flag, the two columns should differ (5 chars, 7 bytes)
995+ // Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
996+ new_ucmd ! ( )
997+ . args ( & [ "-cm" ] )
998+ . env ( "LC_ALL" , "vi_VN.UTF-8" )
999+ . pipe_in ( vietnamese_text)
1000+ . succeeds ( )
1001+ . stdout_is ( " 5 7\n " ) ;
1002+ }
1003+
1004+ #[ test]
1005+ fn test_wc_chars_default_locale ( ) {
1006+ // With LC_ALL, LC_CTYPE and LANG all set to the empty string, wc is expected to fall back to POSIX (chars == bytes)
1007+ // This ensures backward compatibility: "Tiếng" is reported as 7, its byte length
1008+ let vietnamese_text = "Tiếng" ;
1009+
1010+ new_ucmd ! ( )
1011+ . arg ( "-m" )
1012+ . env ( "LC_ALL" , "" )
1013+ . env ( "LC_CTYPE" , "" )
1014+ . env ( "LANG" , "" )
1015+ . pipe_in ( vietnamese_text)
1016+ . succeeds ( )
1017+ . stdout_is ( "7\n " ) ;
1018+ }
1019+
1020+ #[ test]
1021+ fn test_wc_multibyte_c_locale ( ) {
1022+ // Issue #9712 and #5831: Test various multibyte characters in C locale
1023+ // All should be counted as bytes, so both -c and -m columns show the byte length
1024+
1025+ // Vietnamese text with multiple diacritics: "Tiếng Việt"
1026+ // 10 chars, 14 bytes ("ế" and "ệ" are 3 bytes each in UTF-8)
1027+ new_ucmd ! ( )
1028+ . args ( & [ "-cm" ] )
1029+ . env ( "LC_ALL" , "C" )
1030+ . pipe_in ( "Tiếng Việt" )
1031+ . succeeds ( )
1032+ . stdout_is ( " 14 14\n " ) ;
1033+
1034+ // Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
1035+ new_ucmd ! ( )
1036+ . args ( & [ "-cm" ] )
1037+ . env ( "LC_ALL" , "C" )
1038+ . pipe_in ( "ệ" )
1039+ . succeeds ( )
1040+ . stdout_is ( " 3 3\n " ) ;
1041+
1042+ // Mixed ASCII and Vietnamese: "Xin chào" = 8 chars, 9 bytes ("à" is 2 bytes)
1043+ new_ucmd ! ( )
1044+ . args ( & [ "-cm" ] )
1045+ . env ( "LC_ALL" , "C" )
1046+ . pipe_in ( "Xin chào" )
1047+ . succeeds ( )
1048+ . stdout_is ( " 9 9\n " ) ;
1049+ }
1050+
1051+ #[ test]
1052+ fn test_wc_multibyte_utf8_locale ( ) {
1053+ // In a UTF-8 locale, multibyte characters should each be counted as one character
1054+ // Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
1055+
1056+ // Vietnamese "Tiếng Việt": 10 chars, 14 bytes ("ế" and "ệ" are 3 bytes each in UTF-8)
1057+ new_ucmd ! ( )
1058+ . args ( & [ "-cm" ] )
1059+ . env ( "LC_ALL" , "vi_VN.UTF-8" )
1060+ . pipe_in ( "Tiếng Việt" )
1061+ . succeeds ( )
1062+ . stdout_is ( " 10 14\n " ) ;
1063+
1064+ // Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
1065+ new_ucmd ! ( )
1066+ . args ( & [ "-cm" ] )
1067+ . env ( "LC_ALL" , "vi_VN.UTF-8" )
1068+ . pipe_in ( "ệ" )
1069+ . succeeds ( )
1070+ . stdout_is ( " 1 3\n " ) ;
1071+
1072+ // Mixed ASCII and Vietnamese "Xin chào": 8 chars, 9 bytes ("à" is 2 bytes)
1073+ new_ucmd ! ( )
1074+ . args ( & [ "-cm" ] )
1075+ . env ( "LC_ALL" , "vi_VN.UTF-8" )
1076+ . pipe_in ( "Xin chào" )
1077+ . succeeds ( )
1078+ . stdout_is ( " 8 9\n " ) ;
1079+ }
0 commit comments