Skip to content

Commit ad82739

Browse files
committed
fix: accept 4-8 char primary language subtags per RFC 5646 §2.1
The switch in isValidBCP47StrictLanguageTag had two dead-end cases:

    case n == 4:
        return false
    default:
        return false // 5-8 chars land here

The regex (group 2) already accepts [A-Z]{4} and [A-Z]{5,8}, so these tags pass syntactic validation but are then rejected by the switch. RFC 5646 §2.1 explicitly allows both:

- 4-alpha: reserved for future use
- 5-8 alpha: registered language subtags

golang.org/x/text/language does not cover 4-8 char primary subtags, so syntactic validation by the regex is sufficient for both cases.

Adds a unit test suite for isValidBCP47StrictLanguageTag covering 2-char, 3-char, 4-char, and 5-8 char subtags, with and without region/script/extlang, grandfathered tags, private-use tags, and invalid inputs.
1 parent a63759e commit ad82739

File tree

2 files changed

+99
-2
lines changed

2 files changed

+99
-2
lines changed

validators/bcp47.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,12 @@ func isValidBCP47StrictLanguageTag(s string) bool {
117117
return false
118118
}
119119
case n == 4:
120-
return false
120+
// 4-alpha primary subtags are reserved for future use per RFC 5646 §2.1;
121+
// accept them syntactically even though none are currently assigned.
121122
default:
122-
return false
123+
// 5-8 alpha: registered language subtag per RFC 5646 §2.1.
124+
// golang.org/x/text/language does not cover these, so syntactic
125+
// validation by the regex above is sufficient.
123126
}
124127

125128
if script != "" {

validators/bcp47_test.go

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
package validators
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestIsValidBCP47StrictLanguageTag(t *testing.T) {
8+
tests := []struct {
9+
tag string
10+
valid bool
11+
}{
12+
// 2-char primary subtags (ISO 639-1)
13+
{"en", true},
14+
{"it", true},
15+
{"fr", true},
16+
{"de", true},
17+
18+
// 3-char primary subtags with no ISO 639-1 2-char equivalent —
19+
// these normalize to themselves in golang.org/x/text/language.
20+
// 3-char codes that do have a 2-char form (e.g. "eng"→"en") are
21+
// rejected because the validator requires the canonical form.
22+
{"sgn", true},
23+
{"tlh", true},
24+
{"jbo", true},
25+
26+
// 5-8 char primary subtags (RFC 5646 §2.1 registered language subtag).
27+
// Were incorrectly rejected before this fix.
28+
{"abcde", true}, // 5 chars
29+
{"abcdefg", true}, // 7 chars
30+
{"abcdefgh", true}, // 8 chars
31+
32+
// 4-char primary subtag (reserved for future use per RFC 5646 §2.1)
33+
{"abcd", true},
34+
35+
// With region subtag
36+
{"en-US", true},
37+
{"it-IT", true},
38+
{"en-GB", true},
39+
{"zh-CN", true},
40+
41+
// With script subtag
42+
{"zh-Hant", true},
43+
{"zh-Hans", true},
44+
{"sr-Latn", true},
45+
46+
// With script and region
47+
{"zh-Hant-TW", true},
48+
{"sr-Latn-RS", true},
49+
50+
// With extlang subtag
51+
{"zh-cmn", true},
52+
53+
// Grandfathered irregular tags
54+
{"i-ami", true},
55+
{"i-bnn", true},
56+
{"art-lojban", true},
57+
{"zh-min", true},
58+
59+
// Private use
60+
{"x-private", true},
61+
{"x-12345678", true},
62+
63+
// Empty string
64+
{"", false},
65+
66+
// POSIX-style (underscore separator)
67+
{"en_US", false},
68+
{"en_GB", false},
69+
70+
// Primary subtag too long (> 8 chars)
71+
{"abcdefghi", false},
72+
73+
// Digits in primary subtag position
74+
{"1234", false},
75+
76+
// 3-char code with a 2-char canonical form: requires canonical "en"
77+
{"eng", false},
78+
79+
// Unknown extlang subtag
80+
{"en-xyz", false},
81+
82+
// Invalid region
83+
{"en-ZZZ", false},
84+
}
85+
86+
for _, tt := range tests {
87+
t.Run(tt.tag, func(t *testing.T) {
88+
got := isValidBCP47StrictLanguageTag(tt.tag)
89+
if got != tt.valid {
90+
t.Errorf("isValidBCP47StrictLanguageTag(%q) = %v, want %v", tt.tag, got, tt.valid)
91+
}
92+
})
93+
}
94+
}

0 commit comments

Comments
 (0)