Skip to content

Commit 341e906

Browse files
committed
Add support for Unicode escape sequences in the \u{XXXXXX} format
1 parent 087698e commit 341e906

3 files changed

Lines changed: 74 additions & 0 deletions

File tree

parser/lexer/lexer.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,37 @@ func (l *Lexer) scanEscape(quote rune) rune {
208208
case 'x':
209209
ch = l.scanDigits(l.next(), 16, 2)
210210
case 'u':
211+
// Support variable-length form: \u{XXXXXX}
212+
if l.peek() == '{' {
213+
// consume '{'
214+
l.next()
215+
// read 1-6 hex digits
216+
digits := 0
217+
for {
218+
p := l.peek()
219+
if p == '}' {
220+
break
221+
}
222+
if digitVal(p) >= 16 {
223+
l.error("invalid char escape")
224+
return eof
225+
}
226+
if digits >= 6 {
227+
l.error("invalid char escape")
228+
return eof
229+
}
230+
l.next()
231+
digits++
232+
}
233+
if l.peek() != '}' || digits == 0 {
234+
l.error("invalid char escape")
235+
return eof
236+
}
237+
// consume '}' and advance to the character after the escape
238+
l.next()
239+
ch = l.next()
240+
break
241+
}
211242
ch = l.scanDigits(l.next(), 16, 4)
212243
case 'U':
213244
ch = l.scanDigits(l.next(), 16, 8)

parser/lexer/lexer_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,14 @@ func TestLex(t *testing.T) {
291291
{Kind: EOF},
292292
},
293293
},
294+
{
295+
"\"\\u{61}\\u{1F600}\" '\\u{61}\\u{1F600}'",
296+
[]Token{
297+
{Kind: String, Value: "a😀"},
298+
{Kind: String, Value: "a😀"},
299+
{Kind: EOF},
300+
},
301+
},
294302
}
295303

296304
for _, test := range tests {

parser/lexer/utils.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,41 @@ func unescapeChar(s string) (value rune, multibyte bool, tail string, err error)
111111

112112
// 4. Unicode escape sequences, reproduced from `strconv/quote.go`
113113
case 'x', 'X', 'u', 'U':
114+
// Support Rust/JavaScript-style variable-length form: \u{XXXXXX}
115+
if c == 'u' && len(s) > 0 && s[0] == '{' {
116+
// consume '{'
117+
s = s[1:]
118+
var v rune
119+
digits := 0
120+
for len(s) > 0 && s[0] != '}' {
121+
x, ok := unhex(s[0])
122+
if !ok {
123+
err = fmt.Errorf("unable to unescape string")
124+
return
125+
}
126+
if digits >= 6 { // at most 6 hex digits
127+
err = fmt.Errorf("unable to unescape string")
128+
return
129+
}
130+
v = v<<4 | x
131+
s = s[1:]
132+
digits++
133+
}
134+
// require closing '}' and at least 1 digit
135+
if len(s) == 0 || s[0] != '}' || digits == 0 {
136+
err = fmt.Errorf("unable to unescape string")
137+
return
138+
}
139+
// consume '}'
140+
s = s[1:]
141+
if v > utf8.MaxRune {
142+
err = fmt.Errorf("unable to unescape string")
143+
return
144+
}
145+
value = v
146+
multibyte = true
147+
break
148+
}
114149
n := 0
115150
switch c {
116151
case 'x', 'X':

0 commit comments

Comments
 (0)