Skip to content

Commit 86fa4b6

Browse files
jwaldripclaude
andcommitted
feat: implement full Unicode support per GraphQL September 2025 spec
This implements Full Unicode Support as defined in GraphQL specification September 2025 (RFCs #805, #1040, #1053, #1142). Changes: - Add support for variable-width Unicode escape sequences (\u{XXXXXX}) allowing representation of all Unicode scalar values up to U+10FFFF - Add validation for Unicode scalar values in escape sequences - Add support for surrogate pair decoding in fixed-width escapes (\uXXXX) for legacy compatibility with supplementary plane characters - Properly reject invalid escape sequences: - Lone high surrogates (U+D800-U+DBFF) - Lone low surrogates (U+DC00-U+DFFF) - Out of range values (>U+10FFFF) - Surrogates in variable-width escapes - Update Parse phase to handle new Unicode escape error type - Add comprehensive test suite covering: - Basic Unicode in strings - BMP escape sequences (\uXXXX) - Extended escape sequences (\u{XXXXXX}) - Surrogate pair handling - Emoji and supplementary plane characters - Invalid escape rejection - Block strings with Unicode - Edge cases The implementation maintains full backward compatibility with existing GraphQL documents while enabling the new Unicode features. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent a035261 commit 86fa4b6

3 files changed

Lines changed: 707 additions & 8 deletions

File tree

lib/absinthe/lexer.ex

Lines changed: 102 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -155,10 +155,19 @@ defmodule Absinthe.Lexer do
155155
])
156156
|> post_traverse({:labeled_token, [:float_value]})
157157

158-
# EscapedUnicode :: /[0-9A-Fa-f]{4}/
159-
escaped_unicode =
158+
# EscapedUnicode (Fixed-width) :: /[0-9A-Fa-f]{4}/
159+
# Per GraphQL September 2025 spec, this supports BMP characters and surrogate pairs
160+
escaped_unicode_fixed =
160161
times(ascii_char([?0..?9, ?A..?F, ?a..?f]), 4)
161-
|> post_traverse({:unescape_unicode, []})
162+
|> post_traverse({:unescape_unicode_fixed, []})
163+
164+
# EscapedUnicode (Variable-width) :: \u{ [0-9A-Fa-f]+ }
165+
# Per GraphQL September 2025 spec, supports full Unicode range U+0000 to U+10FFFF
166+
escaped_unicode_variable =
167+
ignore(ascii_char([?{]))
168+
|> times(ascii_char([?0..?9, ?A..?F, ?a..?f]), min: 1)
169+
|> ignore(ascii_char([?}]))
170+
|> post_traverse({:unescape_unicode_variable, []})
162171

163172
# EscapedCharacter :: one of `"` \ `/` b f n r t
164173
escaped_character =
@@ -175,11 +184,15 @@ defmodule Absinthe.Lexer do
175184

176185
# StringCharacter ::
177186
# - SourceCharacter but not `"` or \ or LineTerminator
178-
# - \u EscapedUnicode
187+
# - \u{ EscapedUnicode } (variable-width, September 2025 spec)
188+
# - \u EscapedUnicode (fixed-width, legacy)
179189
# - \ EscapedCharacter
180190
string_character =
181191
choice([
182-
ignore(string(~S(\u))) |> concat(escaped_unicode),
192+
# Variable-width Unicode escape: \u{XXXXXX}
193+
ignore(string(~S(\u))) |> concat(escaped_unicode_variable),
194+
# Fixed-width Unicode escape: \uXXXX (with surrogate pair support)
195+
ignore(string(~S(\u))) |> concat(escaped_unicode_fixed),
183196
ignore(ascii_char([?\\])) |> concat(escaped_character),
184197
any_unicode
185198
])
@@ -233,6 +246,7 @@ defmodule Absinthe.Lexer do
233246
{:ok, [any()]}
234247
| {:error, binary(), {integer(), non_neg_integer()}}
235248
| {:error, :exceeded_token_limit}
249+
| {:error, :invalid_unicode_escape, binary(), {integer(), non_neg_integer()}}
236250
def tokenize(input, options \\ []) do
237251
lines = String.split(input, ~r/\r?\n/)
238252

@@ -242,6 +256,12 @@ defmodule Absinthe.Lexer do
242256
{:error, @stopped_at_token_limit, _, _, _, _} ->
243257
{:error, :exceeded_token_limit}
244258

259+
# Handle Unicode escape validation errors
260+
{:error, message, _rest, _context, {line, line_offset}, byte_offset}
261+
when is_binary(message) ->
262+
byte_column = byte_offset - line_offset + 1
263+
{:error, :invalid_unicode_escape, message, byte_loc_to_char_loc({line, byte_column}, lines)}
264+
245265
{:ok, tokens, "", _, _, _} ->
246266
tokens = convert_token_columns_from_byte_to_char(tokens, lines)
247267
{:ok, tokens}
@@ -364,11 +384,85 @@ defmodule Absinthe.Lexer do
364384

365385
defp fill_mantissa(rest, raw, context, _, _), do: {rest, ~c"0." ++ raw, context}
366386

367-
defp unescape_unicode(rest, content, context, _loc, _) do
387+
# Unicode scalar value validation per GraphQL September 2025 spec:
388+
# Valid ranges: U+0000 to U+D7FF, U+E000 to U+10FFFF
389+
# Invalid: surrogate code points U+D800 to U+DFFF (except as surrogate pairs in fixed-width)
390+
defp is_unicode_scalar_value?(value) when value >= 0x0000 and value <= 0xD7FF, do: true
391+
defp is_unicode_scalar_value?(value) when value >= 0xE000 and value <= 0x10FFFF, do: true
392+
defp is_unicode_scalar_value?(_), do: false
393+
394+
# Check if value is a high surrogate (U+D800 to U+DBFF)
395+
defp is_high_surrogate?(value), do: value >= 0xD800 and value <= 0xDBFF
396+
397+
# Check if value is a low surrogate (U+DC00 to U+DFFF)
398+
defp is_low_surrogate?(value), do: value >= 0xDC00 and value <= 0xDFFF
399+
400+
# Decode a surrogate pair to a Unicode scalar value
401+
defp decode_surrogate_pair(high, low) do
402+
0x10000 + ((high - 0xD800) * 0x400) + (low - 0xDC00)
403+
end
404+
405+
# Variable-width Unicode escape: \u{XXXXXX}
406+
# Must be a valid Unicode scalar value (not a surrogate)
407+
defp unescape_unicode_variable(rest, content, context, _loc, _) do
368408
code = content |> Enum.reverse()
369409
value = :erlang.list_to_integer(code, 16)
370-
binary = :unicode.characters_to_binary([value])
371-
{rest, [binary], context}
410+
411+
if is_unicode_scalar_value?(value) do
412+
binary = :unicode.characters_to_binary([value])
413+
{rest, [binary], context}
414+
else
415+
{:error, "Invalid Unicode scalar value in escape sequence"}
416+
end
417+
end
418+
419+
# Fixed-width Unicode escape: \uXXXX
420+
# Handles BMP characters and surrogate pairs for supplementary characters
421+
defp unescape_unicode_fixed(rest, content, context, _loc, _) do
422+
code = content |> Enum.reverse()
423+
value = :erlang.list_to_integer(code, 16)
424+
425+
cond do
426+
# Valid BMP character (not a surrogate)
427+
is_unicode_scalar_value?(value) ->
428+
binary = :unicode.characters_to_binary([value])
429+
{rest, [binary], context}
430+
431+
# High surrogate - check for following low surrogate to form a pair
432+
is_high_surrogate?(value) ->
433+
case rest do
434+
# Look ahead for \uXXXX pattern
435+
<<?\\, ?u, h1, h2, h3, h4, remaining::binary>>
436+
when h1 in ~c"0123456789ABCDEFabcdef" and
437+
h2 in ~c"0123456789ABCDEFabcdef" and
438+
h3 in ~c"0123456789ABCDEFabcdef" and
439+
h4 in ~c"0123456789ABCDEFabcdef" ->
440+
low_code = [h1, h2, h3, h4]
441+
low_value = :erlang.list_to_integer(low_code, 16)
442+
443+
if is_low_surrogate?(low_value) do
444+
# Valid surrogate pair - decode to scalar value
445+
scalar = decode_surrogate_pair(value, low_value)
446+
binary = :unicode.characters_to_binary([scalar])
447+
{remaining, [binary], context}
448+
else
449+
# High surrogate not followed by low surrogate
450+
{:error, "Invalid Unicode escape: high surrogate not followed by low surrogate"}
451+
end
452+
453+
_ ->
454+
# High surrogate without following escape sequence
455+
{:error, "Invalid Unicode escape: lone high surrogate"}
456+
end
457+
458+
# Lone low surrogate (invalid)
459+
is_low_surrogate?(value) ->
460+
{:error, "Invalid Unicode escape: lone low surrogate"}
461+
462+
# Out of range
463+
true ->
464+
{:error, "Invalid Unicode scalar value in escape sequence"}
465+
end
372466
end
373467

374468
@boolean_words ~w(

lib/absinthe/phase/parse.ex

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ defmodule Absinthe.Phase.Parse do
5151
{:error, :exceeded_token_limit} ->
5252
{:error, %Phase.Error{message: "Token limit exceeded", phase: __MODULE__}}
5353

54+
{:error, :invalid_unicode_escape, message, loc} ->
55+
{:error, format_raw_parse_error({:unicode_escape, message, loc})}
56+
5457
other ->
5558
other
5659
end
@@ -113,6 +116,12 @@ defmodule Absinthe.Phase.Parse do
113116
%Phase.Error{message: message, locations: [%{line: line, column: column}], phase: __MODULE__}
114117
end
115118

119+
@spec format_raw_parse_error({:unicode_escape, String.t(), {line :: pos_integer, column :: pos_integer}}) ::
120+
Phase.Error.t()
121+
defp format_raw_parse_error({:unicode_escape, message, {line, column}}) do
122+
%Phase.Error{message: message, locations: [%{line: line, column: column}], phase: __MODULE__}
123+
end
124+
116125
@unknown_error_msg "An unknown error occurred during parsing"
117126
@spec format_raw_parse_error(map) :: Phase.Error.t()
118127
defp format_raw_parse_error(%{} = error) do

0 commit comments

Comments
 (0)