Skip to content

Commit a01a3e1

Browse files
committed
feat(lexer): support UESCAPE clause for Unicode string literals
Implemented support for the optional UESCAPE clause in Unicode string constants (U&'...') and identifiers. This allows specifying a custom escape character (e.g. U&'d!0061t' UESCAPE '!'). Also updated known_failures.json as this fix resolves several regression test failures in strings.sql.
1 parent 988c095 commit a01a3e1

2 files changed

Lines changed: 186 additions & 26 deletions

File tree

parser/lexer.go

Lines changed: 186 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,15 @@ const (
3838
lex_XCONST // Hex string constant (X'...')
3939
lex_USCONST // Unicode string constant (U&'...')
4040
lex_IDENT
41-
lex_UIDENT // Unicode identifier (U&"...")
41+
lex_UIDENT // Unicode identifier (U&"...")
4242
lex_TYPECAST // ::
4343
lex_DOT_DOT // ..
4444
lex_COLON_EQUALS // :=
4545
lex_EQUALS_GREATER // =>
4646
lex_LESS_EQUALS // <=
4747
lex_GREATER_EQUALS // >=
4848
lex_NOT_EQUALS // <> or !=
49-
lex_PARAM // $1, $2, etc.
49+
lex_PARAM // $1, $2, etc.
5050
lex_Op
5151
)
5252

@@ -55,34 +55,34 @@ type LexerState int
5555

5656
const (
5757
stateInitial LexerState = iota
58-
stateXB // bit string literal
59-
stateXC // extended C-style comments
60-
stateXD // delimited identifiers (double-quoted)
61-
stateXH // hexadecimal byte string
62-
stateXQ // standard quoted strings
63-
stateXQS // quote stop (detect continued strings)
64-
stateXE // extended quoted strings (backslash escapes)
65-
stateXDOLQ // dollar-quoted strings
66-
stateXUI // quoted identifier with Unicode escapes
67-
stateXUS // quoted string with Unicode escapes
68-
stateXEU // Unicode surrogate pair in extended quoted string
58+
stateXB // bit string literal
59+
stateXC // extended C-style comments
60+
stateXD // delimited identifiers (double-quoted)
61+
stateXH // hexadecimal byte string
62+
stateXQ // standard quoted strings
63+
stateXQS // quote stop (detect continued strings)
64+
stateXE // extended quoted strings (backslash escapes)
65+
stateXDOLQ // dollar-quoted strings
66+
stateXUI // quoted identifier with Unicode escapes
67+
stateXUS // quoted string with Unicode escapes
68+
stateXEU // Unicode surrogate pair in extended quoted string
6969
)
7070

7171
// Token is a single lexical token returned by the lexer.
type Token struct {
	// Type is the token type (IDENT, ICONST, keyword token, etc.).
	Type int
	// Str is the string value for identifiers, operators and string literals.
	Str string
	// Ival is the integer value for ICONST.
	Ival int64
	// Loc is the byte offset of the token in the source text.
	Loc int
}
7878

7979
// Lexer implements a PostgreSQL-compatible SQL lexer.
8080
type Lexer struct {
81-
input string // Input SQL text
82-
pos int // Current position in input (byte offset)
83-
start int // Start position of current token
81+
input string // Input SQL text
82+
pos int // Current position in input (byte offset)
83+
start int // Start position of current token
8484

85-
state LexerState // Current lexer state
85+
state LexerState // Current lexer state
8686
stateBeforeStrStop LexerState // State before entering xqs
8787

8888
// Literal buffer for building string/identifier values
@@ -606,7 +606,70 @@ func (l *Lexer) lexQuoteContinue() Token {
606606
case stateXQ, stateXE:
607607
return Token{Type: lex_SCONST, Str: str, Loc: l.start}
608608
case stateXUS:
609-
return Token{Type: lex_USCONST, Str: str, Loc: l.start}
609+
// Unicode string: U&'...'
610+
// Check for UESCAPE clause
611+
escapeChar := '\\'
612+
613+
// Look ahead for UESCAPE 'x'
614+
// We need to skip whitespace/comments first
615+
savedPos := l.pos
616+
l.skipWhitespaceAndComments()
617+
618+
// Check for UESCAPE keyword (case insensitive)
619+
if l.pos+7 <= len(l.input) {
620+
word := l.input[l.pos : l.pos+7]
621+
if strings.EqualFold(word, "UESCAPE") {
622+
l.pos += 7
623+
l.skipWhitespaceAndComments()
624+
// Expect single quoted string
625+
if l.pos < len(l.input) && l.input[l.pos] == '\'' {
626+
l.pos++
627+
// Get escape char
628+
if l.pos < len(l.input) {
629+
ch := l.input[l.pos]
630+
escapeChar = rune(ch)
631+
// Verify single char
632+
// We need to handle potential escaped quote or just single char
633+
// Postgres allows 'x' or '' (empty means no escape, but that's handled during decoding maybe? No, 'no escape' means backslash is literal)
634+
// Actually '' is "no escape char".
635+
636+
// Basic check: consume char
637+
l.pos++
638+
// Handle quoted quote ''
639+
if ch == '\'' && l.pos < len(l.input) && l.input[l.pos] == '\'' {
640+
// It was '' inside '...' -> literal quote as escape char
641+
escapeChar = '\''
642+
l.pos++
643+
}
644+
645+
// Must end with quote
646+
if l.pos < len(l.input) && l.input[l.pos] == '\'' {
647+
l.pos++
648+
} else {
649+
// Invalid UESCAPE clause, rollback
650+
l.pos = savedPos
651+
}
652+
} else {
653+
l.pos = savedPos
654+
}
655+
} else {
656+
l.pos = savedPos
657+
}
658+
} else {
659+
l.pos = savedPos
660+
}
661+
} else {
662+
l.pos = savedPos
663+
}
664+
665+
// Decode the string using the determined escape char
666+
decoded, err := l.decodeUnicodeString(str, escapeChar)
667+
if err != nil {
668+
l.Err = err
669+
return Token{Type: lex_EOF, Loc: l.start}
670+
}
671+
672+
return Token{Type: lex_USCONST, Str: decoded, Loc: l.start}
610673
default:
611674
return Token{Type: lex_SCONST, Str: str, Loc: l.start}
612675
}
@@ -1119,3 +1182,104 @@ func surrogateToCodepoint(high, low rune) rune {
11191182
func isValidUnicodeCodepoint(r rune) bool {
11201183
return r <= unicode.MaxRune && !isUTF16SurrogateFirst(r) && !isUTF16SurrogateSecond(r)
11211184
}
1185+
1186+
// isSpaceByte reports whether c is one of the ASCII whitespace bytes:
// space, horizontal tab, newline, carriage return, form feed or vertical tab.
func isSpaceByte(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	default:
		return false
	}
}
1189+
1190+
// skipWhitespaceAndComments skips whitespace and comments (both -- and /* */).
1191+
func (l *Lexer) skipWhitespaceAndComments() {
1192+
for l.pos < len(l.input) {
1193+
ch := l.input[l.pos]
1194+
1195+
if isSpaceByte(ch) {
1196+
l.pos++
1197+
continue
1198+
}
1199+
1200+
if ch == '-' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '-' {
1201+
l.skipLineComment()
1202+
continue
1203+
}
1204+
1205+
if ch == '/' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '*' {
1206+
l.pos += 2
1207+
depth := 1
1208+
for l.pos < len(l.input) && depth > 0 {
1209+
if l.pos >= len(l.input) {
1210+
break
1211+
}
1212+
if l.input[l.pos] == '*' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '/' {
1213+
depth--
1214+
l.pos += 2
1215+
} else if l.input[l.pos] == '/' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '*' {
1216+
depth++
1217+
l.pos += 2
1218+
} else {
1219+
l.pos++
1220+
}
1221+
}
1222+
continue
1223+
}
1224+
1225+
break
1226+
}
1227+
}
1228+
1229+
// decodeUnicodeString decodes a U& string with the given escape character.
1230+
func (l *Lexer) decodeUnicodeString(s string, escape rune) (string, error) {
1231+
var buf strings.Builder
1232+
runes := []rune(s)
1233+
n := len(runes)
1234+
1235+
for i := 0; i < n; i++ {
1236+
r := runes[i]
1237+
1238+
if r == escape {
1239+
if i+1 >= n {
1240+
return "", fmt.Errorf("invalid Unicode escape sequence at end of string")
1241+
}
1242+
1243+
next := runes[i+1]
1244+
1245+
// Escaped escape char
1246+
if next == escape {
1247+
buf.WriteRune(escape)
1248+
i++
1249+
continue
1250+
}
1251+
1252+
// + indicates 6-digit hex
1253+
if next == '+' {
1254+
if i+8 > n {
1255+
return "", fmt.Errorf("invalid Unicode escape sequence")
1256+
}
1257+
hexStr := string(runes[i+2 : i+8])
1258+
val, err := strconv.ParseInt(hexStr, 16, 32)
1259+
if err != nil {
1260+
return "", fmt.Errorf("invalid Unicode escape sequence: %v", err)
1261+
}
1262+
buf.WriteRune(rune(val))
1263+
i += 7
1264+
continue
1265+
}
1266+
1267+
// Otherwise 4-digit hex
1268+
if i+5 > n {
1269+
return "", fmt.Errorf("invalid Unicode escape sequence")
1270+
}
1271+
hexStr := string(runes[i+1 : i+5])
1272+
val, err := strconv.ParseInt(hexStr, 16, 32)
1273+
if err != nil {
1274+
return "", fmt.Errorf("invalid Unicode escape sequence: %v", err)
1275+
}
1276+
buf.WriteRune(rune(val))
1277+
i += 4
1278+
continue
1279+
}
1280+
1281+
buf.WriteRune(r)
1282+
}
1283+
1284+
return buf.String(), nil
1285+
}

parser/pgregress/known_failures.json

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -166,14 +166,10 @@
166166
"strings.sql": [
167167
1,
168168
4,
169-
6,
170169
7,
171170
10,
172-
11,
173171
30,
174-
31,
175172
32,
176-
35,
177173
416
178174
],
179175
"subscription.sql": [

0 commit comments

Comments
 (0)