@@ -38,15 +38,15 @@ const (
3838 lex_XCONST // Hex string constant (X'...')
3939 lex_USCONST // Unicode string constant (U&'...')
4040 lex_IDENT
41- lex_UIDENT // Unicode identifier (U&"...")
41+ lex_UIDENT // Unicode identifier (U&"...")
4242 lex_TYPECAST // ::
4343 lex_DOT_DOT // ..
4444 lex_COLON_EQUALS // :=
4545 lex_EQUALS_GREATER // =>
4646 lex_LESS_EQUALS // <=
4747 lex_GREATER_EQUALS // >=
4848 lex_NOT_EQUALS // <> or !=
49- lex_PARAM // $1, $2, etc.
49+ lex_PARAM // $1, $2, etc.
5050 lex_Op
5151)
5252
@@ -55,34 +55,34 @@ type LexerState int
5555
// Lexer states. stateInitial is the zero value of LexerState; the remaining
// states are numbered consecutively by iota in the order listed.
const (
	stateInitial LexerState = iota
	stateXB                        // bit string literal
	stateXC                        // extended C-style comments
	stateXD                        // delimited identifiers (double-quoted)
	stateXH                        // hexadecimal byte string
	stateXQ                        // standard quoted strings
	stateXQS                       // quote stop (detect continued strings)
	stateXE                        // extended quoted strings (backslash escapes)
	stateXDOLQ                     // dollar-quoted strings
	stateXUI                       // quoted identifier with Unicode escapes
	stateXUS                       // quoted string with Unicode escapes
	stateXEU                       // Unicode surrogate pair in extended quoted string
)
7070
// Token represents a lexical token.
//
// Which of Str and Ival is meaningful depends on Type; see the field
// comments below.
type Token struct {
	Type int    // Token type (IDENT, ICONST, keyword token, etc.)
	Str  string // String value for identifiers, operators, string literals
	Ival int64  // Integer value for ICONST
	Loc  int    // Byte offset in the source text where the token starts
}
7878
7979// Lexer implements a PostgreSQL-compatible SQL lexer.
8080type Lexer struct {
81- input string // Input SQL text
82- pos int // Current position in input (byte offset)
83- start int // Start position of current token
81+ input string // Input SQL text
82+ pos int // Current position in input (byte offset)
83+ start int // Start position of current token
8484
85- state LexerState // Current lexer state
85+ state LexerState // Current lexer state
8686 stateBeforeStrStop LexerState // State before entering xqs
8787
8888 // Literal buffer for building string/identifier values
@@ -606,7 +606,70 @@ func (l *Lexer) lexQuoteContinue() Token {
606606 case stateXQ , stateXE :
607607 return Token {Type : lex_SCONST , Str : str , Loc : l .start }
608608 case stateXUS :
609- return Token {Type : lex_USCONST , Str : str , Loc : l .start }
609+ // Unicode string: U&'...'
610+ // Check for UESCAPE clause
611+ escapeChar := '\\'
612+
613+ // Look ahead for UESCAPE 'x'
614+ // We need to skip whitespace/comments first
615+ savedPos := l .pos
616+ l .skipWhitespaceAndComments ()
617+
618+ // Check for UESCAPE keyword (case insensitive)
619+ if l .pos + 7 <= len (l .input ) {
620+ word := l .input [l .pos : l .pos + 7 ]
621+ if strings .EqualFold (word , "UESCAPE" ) {
622+ l .pos += 7
623+ l .skipWhitespaceAndComments ()
624+ // Expect single quoted string
625+ if l .pos < len (l .input ) && l .input [l .pos ] == '\'' {
626+ l .pos ++
627+ // Get escape char
628+ if l .pos < len (l .input ) {
629+ ch := l .input [l .pos ]
630+ escapeChar = rune (ch )
631+ // Verify single char
632+ // We need to handle potential escaped quote or just single char
633+ // Postgres allows 'x' or '' (empty means no escape, but that's handled during decoding maybe? No, 'no escape' means backslash is literal)
634+ // Actually '' is "no escape char".
635+
636+ // Basic check: consume char
637+ l .pos ++
638+ // Handle quoted quote ''
639+ if ch == '\'' && l .pos < len (l .input ) && l .input [l .pos ] == '\'' {
640+ // It was '' inside '...' -> literal quote as escape char
641+ escapeChar = '\''
642+ l .pos ++
643+ }
644+
645+ // Must end with quote
646+ if l .pos < len (l .input ) && l .input [l .pos ] == '\'' {
647+ l .pos ++
648+ } else {
649+ // Invalid UESCAPE clause, rollback
650+ l .pos = savedPos
651+ }
652+ } else {
653+ l .pos = savedPos
654+ }
655+ } else {
656+ l .pos = savedPos
657+ }
658+ } else {
659+ l .pos = savedPos
660+ }
661+ } else {
662+ l .pos = savedPos
663+ }
664+
665+ // Decode the string using the determined escape char
666+ decoded , err := l .decodeUnicodeString (str , escapeChar )
667+ if err != nil {
668+ l .Err = err
669+ return Token {Type : lex_EOF , Loc : l .start }
670+ }
671+
672+ return Token {Type : lex_USCONST , Str : decoded , Loc : l .start }
610673 default :
611674 return Token {Type : lex_SCONST , Str : str , Loc : l .start }
612675 }
@@ -1119,3 +1182,104 @@ func surrogateToCodepoint(high, low rune) rune {
11191182func isValidUnicodeCodepoint (r rune ) bool {
11201183 return r <= unicode .MaxRune && ! isUTF16SurrogateFirst (r ) && ! isUTF16SurrogateSecond (r )
11211184}
1185+
// isSpaceByte reports whether c is one of the ASCII whitespace bytes: space,
// horizontal tab, newline, carriage return, form feed or vertical tab.
func isSpaceByte(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	default:
		return false
	}
}
1189+
1190+ // skipWhitespaceAndComments skips whitespace and comments (both -- and /* */).
1191+ func (l * Lexer ) skipWhitespaceAndComments () {
1192+ for l .pos < len (l .input ) {
1193+ ch := l .input [l .pos ]
1194+
1195+ if isSpaceByte (ch ) {
1196+ l .pos ++
1197+ continue
1198+ }
1199+
1200+ if ch == '-' && l .pos + 1 < len (l .input ) && l .input [l .pos + 1 ] == '-' {
1201+ l .skipLineComment ()
1202+ continue
1203+ }
1204+
1205+ if ch == '/' && l .pos + 1 < len (l .input ) && l .input [l .pos + 1 ] == '*' {
1206+ l .pos += 2
1207+ depth := 1
1208+ for l .pos < len (l .input ) && depth > 0 {
1209+ if l .pos >= len (l .input ) {
1210+ break
1211+ }
1212+ if l .input [l .pos ] == '*' && l .pos + 1 < len (l .input ) && l .input [l .pos + 1 ] == '/' {
1213+ depth --
1214+ l .pos += 2
1215+ } else if l .input [l .pos ] == '/' && l .pos + 1 < len (l .input ) && l .input [l .pos + 1 ] == '*' {
1216+ depth ++
1217+ l .pos += 2
1218+ } else {
1219+ l .pos ++
1220+ }
1221+ }
1222+ continue
1223+ }
1224+
1225+ break
1226+ }
1227+ }
1228+
1229+ // decodeUnicodeString decodes a U& string with the given escape character.
1230+ func (l * Lexer ) decodeUnicodeString (s string , escape rune ) (string , error ) {
1231+ var buf strings.Builder
1232+ runes := []rune (s )
1233+ n := len (runes )
1234+
1235+ for i := 0 ; i < n ; i ++ {
1236+ r := runes [i ]
1237+
1238+ if r == escape {
1239+ if i + 1 >= n {
1240+ return "" , fmt .Errorf ("invalid Unicode escape sequence at end of string" )
1241+ }
1242+
1243+ next := runes [i + 1 ]
1244+
1245+ // Escaped escape char
1246+ if next == escape {
1247+ buf .WriteRune (escape )
1248+ i ++
1249+ continue
1250+ }
1251+
1252+ // + indicates 6-digit hex
1253+ if next == '+' {
1254+ if i + 8 > n {
1255+ return "" , fmt .Errorf ("invalid Unicode escape sequence" )
1256+ }
1257+ hexStr := string (runes [i + 2 : i + 8 ])
1258+ val , err := strconv .ParseInt (hexStr , 16 , 32 )
1259+ if err != nil {
1260+ return "" , fmt .Errorf ("invalid Unicode escape sequence: %v" , err )
1261+ }
1262+ buf .WriteRune (rune (val ))
1263+ i += 7
1264+ continue
1265+ }
1266+
1267+ // Otherwise 4-digit hex
1268+ if i + 5 > n {
1269+ return "" , fmt .Errorf ("invalid Unicode escape sequence" )
1270+ }
1271+ hexStr := string (runes [i + 1 : i + 5 ])
1272+ val , err := strconv .ParseInt (hexStr , 16 , 32 )
1273+ if err != nil {
1274+ return "" , fmt .Errorf ("invalid Unicode escape sequence: %v" , err )
1275+ }
1276+ buf .WriteRune (rune (val ))
1277+ i += 4
1278+ continue
1279+ }
1280+
1281+ buf .WriteRune (r )
1282+ }
1283+
1284+ return buf .String (), nil
1285+ }
0 commit comments