diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index ff071af..8c69eee 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -665,8 +665,33 @@ def return_tokens(self, toks: typing.Sequence[LexToken]) -> None: class LexerTokenStream(TokenStream): """ Provides tokens from using PlyLexer on the given input text + + This class also handles C++ digraphs (ISO C++ §2.6 [lex.digraph]), which + are two-character alternative representations for certain tokens:: + + <% -> { + %> -> } + <: -> [ + :> -> ] + %: -> # (preprocessor; listed for reference only, not merged by this class) + + Digraph recognition happens here, after the PLY tokenizer emits individual + characters, so the rest of the parser sees only the canonical tokens. """ + # Maps (first_token_type, second_token_value) -> replacement string, which is used as both the type and the value of the merged token. + # Only two-token digraph pairs that the PLY lexer will split are listed. + # The PLY lexer emits '<', '%', and ':' as literal tokens (single chars), + # so each digraph arrives as two consecutive tokens. + _digraph_map: typing.Dict[typing.Tuple[str, str], str] = { + ("%", ">"): "}", # %> -> } + ("<", "%"): "{", # <% -> { + ("<", ":"): "[", # <: -> [ + (":", ">"): "]", # :> -> ] + # %: -> # would produce a PP_DIRECTIVE; we leave it for the existing + # preprocessor-directive error path rather than silently mangling it. + } + _user_defined_literal_start = { "FLOAT_CONST", "HEX_FLOAT_CONST", @@ -702,9 +727,36 @@ def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool: return False udl_start = self._user_defined_literal_start + digraph_map = self._digraph_map while True: tok.location = self._lex.current_location() + + # Detect C++ digraphs: two consecutive literal tokens that together + # form an alternative representation of a single token (ISO C++ §2.6). 
+ # We read the next token; if the pair is a known digraph we merge + # them into the canonical single-character token *before* appending + # to the buffer, so the rest of the parser never sees the raw digraph. + if tok.type in ("<", "%", ":"): + tok2 = get_token() + if tok2 is not None: + replacement = digraph_map.get((tok.type, tok2.value)) + if replacement is not None: + # Reuse tok, replace its type/value with the canonical token. + tok.type = replacement + tok.value = replacement + tokbuf.append(tok) + tok = get_token() + if tok is None: + break + continue + else: + # Not a digraph — append tok unchanged and restart the loop with tok2 as the current token, so tok2 gets its location set and is itself checked as a possible digraph start. + tokbuf.append(tok) + tok = tok2 + continue + # tok2 is None (EOF): fall through so tok is appended by the normal path below. + tokbuf.append(tok) if tok.type == "NEWLINE": diff --git a/tests/test_digraphs.py b/tests/test_digraphs.py new file mode 100644 index 0000000..489f32e --- /dev/null +++ b/tests/test_digraphs.py @@ -0,0 +1,213 @@ +from cxxheaderparser.simple import parse_string, ParsedData +from cxxheaderparser.types import ( + FundamentalSpecifier, + NameSpecifier, + PQName, + Type, + Variable, + Function, + FunctionType, + Parameter, + Array, +) + + +def test_digraph_brace_open_close_function(): + """<% %> should work as { } in a function body context (body is skipped).""" + # The parser skips function bodies but must recognise the braces. + content = """\ +#include +int main() +<% + std::cout << "Hello, World!" 
<< std::endl; + return 0; +%> +""" + # parse_string should not raise + result = parse_string(content) + assert isinstance(result, ParsedData) + + +def test_digraph_struct_body(): + """<% %> should work as { } around a struct body.""" + content = """\ +struct Point +<% + int x; + int y; +%>; +""" + result = parse_string(content) + assert len(result.namespace.classes) == 1 + cls = result.namespace.classes[0] + assert cls.class_decl.typename.segments[-1].name == "Point" + field_names = [f.name for f in cls.fields] + assert field_names == ["x", "y"] + + +def test_digraph_namespace_body(): + """<% %> should work as { } around a namespace body.""" + content = """\ +namespace myns +<% + int value; +%> +""" + result = parse_string(content) + assert "myns" in result.namespace.namespaces + ns = result.namespace.namespaces["myns"] + assert len(ns.variables) == 1 + assert ns.variables[0].name.segments[-1].name == "value" + + +def test_digraph_nested_braces(): + """Nested digraph brace pairs should work correctly.""" + content = """\ +namespace outer +<% + struct Inner + <% + int val; + %>; +%> +""" + result = parse_string(content) + assert "outer" in result.namespace.namespaces + ns = result.namespace.namespaces["outer"] + assert len(ns.classes) == 1 + inner = ns.classes[0] + assert inner.class_decl.typename.segments[-1].name == "Inner" + assert inner.fields[0].name == "val" + + +def test_digraph_mixed_braces(): + """Digraph and canonical braces can be mixed freely.""" + content = """\ +namespace ns +<% + struct Foo { + int a; + }; +%> +""" + result = parse_string(content) + assert "ns" in result.namespace.namespaces + ns = result.namespace.namespaces["ns"] + assert len(ns.classes) == 1 + + +def test_digraph_array_subscript(): + """<: :> should work as [ ] in an array declaration.""" + content = """\ +int arr<:10:>; +""" + result = parse_string(content) + assert len(result.namespace.variables) == 1 + var = result.namespace.variables[0] + assert var.name.segments[-1].name == 
"arr" + # The type should be an array; only the Array wrapper is asserted here (size and element type are not checked) + assert isinstance(var.type, Array) + + +def test_digraph_array_and_brace(): + """Both digraph pairs used together.""" + content = """\ +struct Grid +<% + float data<:4:>; +%>; +""" + result = parse_string(content) + cls = result.namespace.classes[0] + assert cls.class_decl.typename.segments[-1].name == "Grid" + field = cls.fields[0] + assert field.name == "data" + assert isinstance(field.type, Array) + + +def test_canonical_tokens_unaffected(): + """Normal { } [ ] tokens must continue to work after digraph support.""" + content = """\ +namespace ns { + struct Foo { + int arr[5]; + }; +} +""" + result = parse_string(content) + assert "ns" in result.namespace.namespaces + ns = result.namespace.namespaces["ns"] + cls = ns.classes[0] + assert cls.fields[0].name == "arr" + + +def test_template_angle_brackets_unaffected(): + """< > used as template angle brackets must NOT be treated as digraphs.""" + content = """\ +template +struct Container +{ + T data[N]; +}; +""" + result = parse_string(content) + cls = result.namespace.classes[0] + assert cls.class_decl.typename.segments[-1].name == "Container" + assert len(cls.class_decl.template.params) == 2 + + +def test_shift_left_unaffected(): + """The << operator (SHIFT_LEFT token) must not be affected by digraph detection.""" + content = """\ +template +void fn(T x); +""" + result = parse_string(content) + assert len(result.namespace.functions) == 1 + assert result.namespace.functions[0].name.segments[-1].name == "fn" + + +def test_percent_operator_unaffected(): + """A bare % in an expression context must not be altered.""" + # The parser skips default parameter expressions, so we embed % there. 
+ content = """\ +void fn(int x = 10 % 3); +""" + result = parse_string(content) + assert result.namespace.functions[0].name.segments[-1].name == "fn" + + +def test_digraph_enum(): + """<% %> should work as { } in an enum definition.""" + content = """\ +enum Color +<% + Red, + Green, + Blue +%>; +""" + result = parse_string(content) + assert len(result.namespace.enums) == 1 + en = result.namespace.enums[0] + values = [v.name for v in en.values] + assert values == ["Red", "Green", "Blue"] + + +def test_digraph_class_with_methods(): + """<% %> braces work for a class with member function declarations.""" + content = """\ +class MyClass +<% +public: + MyClass(); + ~MyClass(); + int getValue() const; +%>; +""" + result = parse_string(content) + cls = result.namespace.classes[0] + assert cls.class_decl.typename.segments[-1].name == "MyClass" + method_names = [m.name.segments[-1].name for m in cls.methods] + assert "MyClass" in method_names + assert "getValue" in method_names