@@ -155,10 +155,19 @@ defmodule Absinthe.Lexer do
155155 ] )
156156 |> post_traverse ( { :labeled_token , [ :float_value ] } )
157157
# EscapedUnicode (fixed-width) :: /[0-9A-Fa-f]{4}/
# Matches exactly four hexadecimal digits and hands them to
# `unescape_unicode_fixed`, which decodes them (including surrogate pairs,
# per the GraphQL September 2025 spec).
escaped_unicode_fixed =
  [?0..?9, ?A..?F, ?a..?f]
  |> ascii_char()
  |> times(4)
  |> post_traverse({:unescape_unicode_fixed, []})
163+
# EscapedUnicode (variable-width) :: \u{ [0-9A-Fa-f]+ }
# Matches `{`, one or more hexadecimal digits, then `}`. The braces are
# dropped from the result and the digits are handed to
# `unescape_unicode_variable` for validation and decoding (GraphQL
# September 2025 spec, U+0000 to U+10FFFF).
escaped_unicode_variable =
  [?{]
  |> ascii_char()
  |> ignore()
  |> times(ascii_char([?0..?9, ?A..?F, ?a..?f]), min: 1)
  |> ignore(ascii_char([?}]))
  |> post_traverse({:unescape_unicode_variable, []})
162171
163172 # EscapedCharacter :: one of `"` \ `/` b f n r t
164173 escaped_character =
@@ -175,11 +184,15 @@ defmodule Absinthe.Lexer do
175184
# StringCharacter ::
#   - SourceCharacter but not `"` or \ or LineTerminator
#   - \u{ EscapedUnicode } (variable-width, September 2025 spec)
#   - \u EscapedUnicode (fixed-width, legacy)
#   - \ EscapedCharacter
#
# Alternatives are tried in the order listed; the `\u` and `\` prefixes are
# matched but left out of the result.
string_character =
  choice([
    # \u{XXXXXX}
    concat(ignore(string(~S(\u))), escaped_unicode_variable),
    # \uXXXX (surrogate pairs are resolved by the fixed-width callback)
    concat(ignore(string(~S(\u))), escaped_unicode_fixed),
    concat(ignore(ascii_char([?\\])), escaped_character),
    any_unicode
  ])
@@ -233,6 +246,7 @@ defmodule Absinthe.Lexer do
233246 { :ok , [ any ( ) ] }
234247 | { :error , binary ( ) , { integer ( ) , non_neg_integer ( ) } }
235248 | { :error , :exceeded_token_limit }
249+ | { :error , :invalid_unicode_escape , binary ( ) , { integer ( ) , non_neg_integer ( ) } }
236250 def tokenize ( input , options \\ [ ] ) do
237251 lines = String . split ( input , ~r/ \r ?\n / )
238252
@@ -242,6 +256,12 @@ defmodule Absinthe.Lexer do
242256 { :error , @ stopped_at_token_limit , _ , _ , _ , _ } ->
243257 { :error , :exceeded_token_limit }
244258
259+ # Handle Unicode escape validation errors
260+ { :error , message , _rest , _context , { line , line_offset } , byte_offset }
261+ when is_binary ( message ) ->
262+ byte_column = byte_offset - line_offset + 1
263+ { :error , :invalid_unicode_escape , message , byte_loc_to_char_loc ( { line , byte_column } , lines ) }
264+
245265 { :ok , tokens , "" , _ , _ , _ } ->
246266 tokens = convert_token_columns_from_byte_to_char ( tokens , lines )
247267 { :ok , tokens }
@@ -364,11 +384,85 @@ defmodule Absinthe.Lexer do
364384
# Traversal callback that puts the characters `0.` in front of the matched
# characters, leaving `rest` and `context` untouched.
defp fill_mantissa(rest, raw, context, _, _) do
  {rest, [?0, ?. | raw], context}
end
366386
367- defp unescape_unicode ( rest , content , context , _loc , _ ) do
# Unicode scalar value check per the GraphQL September 2025 spec.
# Returns true for U+0000..U+D7FF and U+E000..U+10FFFF; the surrogate block
# U+D800..U+DFFF and anything above U+10FFFF yield false.
defp is_unicode_scalar_value?(value) do
  below_surrogates? = value >= 0x0000 and value <= 0xD7FF
  above_surrogates? = value >= 0xE000 and value <= 0x10FFFF

  below_surrogates? or above_surrogates?
end
393+
# True when `value` lies in the high (leading) surrogate range U+D800..U+DBFF.
defp is_high_surrogate?(value) when value >= 0xD800 and value <= 0xDBFF, do: true
defp is_high_surrogate?(_value), do: false
396+
# True when `value` lies in the low (trailing) surrogate range U+DC00..U+DFFF.
defp is_low_surrogate?(value) when value >= 0xDC00 and value <= 0xDFFF, do: true
defp is_low_surrogate?(_value), do: false
399+
# Combines a high/low surrogate pair into the supplementary-plane code point
# it encodes: the high surrogate's offset from 0xD800 is scaled by 0x400, the
# low surrogate's offset from 0xDC00 is added, and the result is based at
# 0x10000. Callers are expected to have range-checked both arguments.
defp decode_surrogate_pair(high, low) do
  high_part = (high - 0xD800) * 0x400
  low_part = low - 0xDC00

  0x10000 + high_part + low_part
end
404+
# Variable-width Unicode escape: \u{XXXXXX}
#
# Traversal callback for the hex digits found between the braces. `content`
# is reversed before it is read, matching how the digits are accumulated.
#
# Returns `{rest, [utf8_binary], context}` when the digits denote a Unicode
# scalar value (surrogates are rejected), otherwise
# `{:error, "Invalid Unicode scalar value in escape sequence"}`.
#
# The number of digits is not bounded here, so the input may be arbitrarily
# long. Leading zeros carry no value and are stripped; anything with more than
# six significant hex digits is necessarily above U+10FFFF and is rejected
# before conversion, so no arbitrarily large integer is ever built from
# untrusted input.
defp unescape_unicode_variable(rest, content, context, _loc, _) do
  significant_digits =
    content
    |> Enum.reverse()
    |> Enum.drop_while(&(&1 == ?0))

  case significant_digits do
    # Seven or more significant digits: >= 0x1000000, outside the Unicode range.
    [_, _, _, _, _, _, _ | _] ->
      {:error, "Invalid Unicode scalar value in escape sequence"}

    _ ->
      # The prepended zero keeps the all-zero escape (e.g. \u{0}) convertible
      # after its digits were stripped above.
      value = :erlang.list_to_integer([?0 | significant_digits], 16)

      if is_unicode_scalar_value?(value) do
        {rest, [:unicode.characters_to_binary([value])], context}
      else
        {:error, "Invalid Unicode scalar value in escape sequence"}
      end
  end
end
418+
# Fixed-width Unicode escape: \uXXXX
#
# Traversal callback for the four hex digits that follow `\u`. `content` is
# reversed before it is read.
#
# Outcomes:
#   * scalar value           -> `{rest, [utf8_binary], context}`
#   * high surrogate         -> delegated to `pair_high_surrogate/3`, which
#                               looks for an immediately following `\uXXXX`
#                               low surrogate in `rest`
#   * low surrogate on its own, or any other value -> `{:error, message}`
defp unescape_unicode_fixed(rest, content, context, _loc, _) do
  value =
    content
    |> Enum.reverse()
    |> :erlang.list_to_integer(16)

  cond do
    is_unicode_scalar_value?(value) ->
      {rest, [:unicode.characters_to_binary([value])], context}

    is_high_surrogate?(value) ->
      pair_high_surrogate(value, rest, context)

    is_low_surrogate?(value) ->
      {:error, "Invalid Unicode escape: lone low surrogate"}

    # Kept as a safety net; four hex digits should always hit a branch above.
    true ->
      {:error, "Invalid Unicode scalar value in escape sequence"}
  end
end

# Tries to complete a surrogate pair whose high half is `high`.
#
# First clause: the remaining input starts with `\u` plus four hex digits.
# If those digits are a low surrogate, the six bytes are consumed and the
# combined code point is emitted; otherwise the escape is rejected.
defp pair_high_surrogate(high, <<?\\, ?u, h1, h2, h3, h4, remaining::binary>>, context)
     when h1 in ~c"0123456789ABCDEFabcdef" and
            h2 in ~c"0123456789ABCDEFabcdef" and
            h3 in ~c"0123456789ABCDEFabcdef" and
            h4 in ~c"0123456789ABCDEFabcdef" do
  low = :erlang.list_to_integer([h1, h2, h3, h4], 16)

  if is_low_surrogate?(low) do
    code_point = decode_surrogate_pair(high, low)

    # NOTE(review): `remaining` is six bytes shorter than the `rest` received,
    # and no offset is adjusted here. Confirm the parser library keeps byte
    # offsets (and thus later token columns) correct when `rest` is changed.
    {remaining, [:unicode.characters_to_binary([code_point])], context}
  else
    {:error, "Invalid Unicode escape: high surrogate not followed by low surrogate"}
  end
end

# Second clause: nothing resembling `\uXXXX` follows the high surrogate.
defp pair_high_surrogate(_high, _rest, _context) do
  {:error, "Invalid Unicode escape: lone high surrogate"}
end
373467
374468 @ boolean_words ~w(
0 commit comments