Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions cxxheaderparser/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,8 +665,33 @@ def return_tokens(self, toks: typing.Sequence[LexToken]) -> None:
class LexerTokenStream(TokenStream):
"""
Provides tokens from using PlyLexer on the given input text

This class also handles C++ digraphs (ISO C++ §2.6 [lex.digraph]), which
are two-character alternative representations for certain tokens::

<% -> {
%> -> }
<: -> [
:> -> ]
%: -> # (preprocessor; rejected later as unsupported)

Digraph recognition happens here, after the PLY tokenizer emits individual
characters, so the rest of the parser sees only the canonical tokens.
"""

# Maps (first_token_type, second_token_value) -> replacement_token_type
# Only two-token digraph pairs that the PLY lexer will split are listed.
# The PLY lexer emits '<', '%', and ':' as literal tokens (single chars),
# so each digraph arrives as two consecutive tokens.
_digraph_map: typing.Dict[typing.Tuple[str, str], str] = {
("%", ">"): "}", # %> -> }
("<", "%"): "{", # <% -> {
("<", ":"): "[", # <: -> [
(":", ">"): "]", # :> -> ]
# %: -> # would produce a PP_DIRECTIVE; we leave it for the existing
# preprocessor-directive error path rather than silently mangling it.
}

_user_defined_literal_start = {
"FLOAT_CONST",
"HEX_FLOAT_CONST",
Expand Down Expand Up @@ -702,9 +727,36 @@ def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool:
return False

udl_start = self._user_defined_literal_start
digraph_map = self._digraph_map

while True:
tok.location = self._lex.current_location()

# Detect C++ digraphs: two consecutive literal tokens that together
# form an alternative representation of a single token (ISO C++ §2.6).
# We peek at the next token; if the pair is a known digraph we merge
# them into the canonical single-character token *before* appending
# to the buffer, so the rest of the parser never sees the raw digraph.
if tok.type in ("<", "%", ":"):
tok2 = get_token()
if tok2 is not None:
replacement = digraph_map.get((tok.type, tok2.value))
if replacement is not None:
# Reuse tok, replace its type/value with the canonical token.
tok.type = replacement
tok.value = replacement
tokbuf.append(tok)
tok = get_token()
if tok is None:
break
continue
else:
# Not a digraph — process tok normally and re-queue tok2.
tokbuf.append(tok)
tok = tok2
continue
# tok2 is None (EOF): fall through to append tok and return.

tokbuf.append(tok)

if tok.type == "NEWLINE":
Expand Down
213 changes: 213 additions & 0 deletions tests/test_digraphs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
from cxxheaderparser.simple import parse_string, ParsedData
from cxxheaderparser.types import (
FundamentalSpecifier,
NameSpecifier,
PQName,
Type,
Variable,
Function,
FunctionType,
Parameter,
Array,
)


def test_digraph_brace_open_close_function():
    """``<%`` / ``%>`` must be accepted as ``{`` / ``}`` around a function body.

    The parser skips function bodies, but it still has to recognise the
    delimiters to find where the body ends.
    """
    # Reviewer note from the PR discussion: for any future contributions,
    # please use ``python -m cxxheaderparser.gentest`` to generate tests.
    content = """\
#include <iostream>

int main()
<%
std::cout << "Hello, World!" << std::endl;
return 0;
%>
"""
    # parse_string should not raise
    result = parse_string(content)
    assert isinstance(result, ParsedData)

    # Beyond "does not raise": the function declaration itself must be found.
    function_names = [fn.name.segments[-1].name for fn in result.namespace.functions]
    assert function_names == ["main"]


def test_digraph_struct_body():
    """Parse a struct delimited by ``<%`` / ``%>`` and check its name and fields."""
    source = """\
struct Point
<%
int x;
int y;
%>;
"""
    parsed = parse_string(source)
    classes = parsed.namespace.classes
    assert len(classes) == 1

    point = classes[0]
    assert point.class_decl.typename.segments[-1].name == "Point"
    assert [field.name for field in point.fields] == ["x", "y"]


def test_digraph_namespace_body():
    """Parse a namespace delimited by ``<%`` / ``%>`` and check its single variable."""
    source = """\
namespace myns
<%
int value;
%>
"""
    parsed = parse_string(source)
    namespaces = parsed.namespace.namespaces
    assert "myns" in namespaces

    variables = namespaces["myns"].variables
    assert len(variables) == 1
    assert variables[0].name.segments[-1].name == "value"


def test_digraph_nested_braces():
    """A digraph-delimited struct nested in a digraph-delimited namespace parses."""
    source = """\
namespace outer
<%
struct Inner
<%
int val;
%>;
%>
"""
    parsed = parse_string(source)
    namespaces = parsed.namespace.namespaces
    assert "outer" in namespaces

    outer_classes = namespaces["outer"].classes
    assert len(outer_classes) == 1

    inner_struct = outer_classes[0]
    assert inner_struct.class_decl.typename.segments[-1].name == "Inner"
    assert inner_struct.fields[0].name == "val"


def test_digraph_mixed_braces():
    """A ``<% %>`` namespace may contain a struct written with ordinary braces."""
    source = """\
namespace ns
<%
struct Foo {
int a;
};
%>
"""
    parsed = parse_string(source)
    namespaces = parsed.namespace.namespaces
    assert "ns" in namespaces
    assert len(namespaces["ns"].classes) == 1


def test_digraph_array_subscript():
    """``<:`` / ``:>`` must act as ``[`` / ``]`` in an array declaration."""
    source = """\
int arr<:10:>;
"""
    parsed = parse_string(source)
    variables = parsed.namespace.variables
    assert len(variables) == 1

    declared = variables[0]
    assert declared.name.segments[-1].name == "arr"
    # Only the array-ness of the type is checked here, not its size.
    assert isinstance(declared.type, Array)


def test_digraph_array_and_brace():
    """Brace digraphs and bracket digraphs can be combined in one declaration."""
    source = """\
struct Grid
<%
float data<:4:>;
%>;
"""
    parsed = parse_string(source)
    grid = parsed.namespace.classes[0]
    assert grid.class_decl.typename.segments[-1].name == "Grid"

    data_field = grid.fields[0]
    assert data_field.name == "data"
    assert isinstance(data_field.type, Array)


def test_canonical_tokens_unaffected():
    """Regression check: plain ``{ }`` and ``[ ]`` still parse as before."""
    source = """\
namespace ns {
struct Foo {
int arr[5];
};
}
"""
    parsed = parse_string(source)
    namespaces = parsed.namespace.namespaces
    assert "ns" in namespaces

    first_class = namespaces["ns"].classes[0]
    assert first_class.fields[0].name == "arr"


def test_template_angle_brackets_unaffected():
    """Template ``<`` / ``>`` delimiters must not be mistaken for digraph halves."""
    source = """\
template <typename T, int N>
struct Container
{
T data[N];
};
"""
    parsed = parse_string(source)
    container_decl = parsed.namespace.classes[0].class_decl
    assert container_decl.typename.segments[-1].name == "Container"
    assert len(container_decl.template.params) == 2


def test_shift_left_unaffected():
    """The << operator (SHIFT_LEFT token) must not be affected by digraph detection.

    The input keeps the original template declaration and adds a variable
    whose initializer actually contains ``<<``, so the operator named in this
    test is really exercised.
    """
    content = """\
template <typename T>
void fn(T x);

int mask = 1 << 4;
"""
    result = parse_string(content)
    assert len(result.namespace.functions) == 1
    assert result.namespace.functions[0].name.segments[-1].name == "fn"

    # The shift expression must come through as a single ``<<`` token rather
    # than being split or rewritten by the digraph handling.
    assert len(result.namespace.variables) == 1
    mask = result.namespace.variables[0]
    assert mask.name.segments[-1].name == "mask"
    assert [tok.value for tok in mask.value.tokens] == ["1", "<<", "4"]


def test_percent_operator_unaffected():
    """A lone ``%`` inside a default-argument expression must be left alone."""
    # The % is placed in a default parameter value, which the parser skips.
    source = """\
void fn(int x = 10 % 3);
"""
    parsed = parse_string(source)
    first_function = parsed.namespace.functions[0]
    assert first_function.name.segments[-1].name == "fn"


def test_digraph_enum():
    """An enum delimited by ``<%`` / ``%>`` yields its enumerators in order."""
    source = """\
enum Color
<%
Red,
Green,
Blue
%>;
"""
    parsed = parse_string(source)
    enums = parsed.namespace.enums
    assert len(enums) == 1

    enumerator_names = [enumerator.name for enumerator in enums[0].values]
    assert enumerator_names == ["Red", "Green", "Blue"]


def test_digraph_class_with_methods():
    """A ``<% %>`` class body with member function declarations is parsed."""
    source = """\
class MyClass
<%
public:
MyClass();
~MyClass();
int getValue() const;
%>;
"""
    parsed = parse_string(source)
    my_class = parsed.namespace.classes[0]
    assert my_class.class_decl.typename.segments[-1].name == "MyClass"

    declared_methods = [method.name.segments[-1].name for method in my_class.methods]
    for expected_name in ("MyClass", "getValue"):
        assert expected_name in declared_methods
Loading