Skip to content

Commit d4c673a

Browse files
authored
test: improve Pygments test reliability and diagnostics (#1)
- Use proc_open to capture stdout/stderr and debug "unknown" CI failures. - Improve Python/Pygments detection to ensure tests skip correctly when missing. - Include process output in error messages for better troubleshooting.
1 parent bdeb9c3 commit d4c673a

2 files changed

Lines changed: 228 additions & 70 deletions

File tree

bin/pygments-tokenize

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Tokenize source code using Pygments and output JSON.
4+
5+
Usage:
6+
pygments-tokenize <language> < input.code
7+
pygments-tokenize <language> <file>
8+
9+
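Output: JSON object {"tokens": [...], "language": <language>}, where each
token is {"text", "pygments_type", "category"}; on failure the object is
{"error": <message>} instead.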
10+
11+
This script is used by the test suite to compare the library's output
12+
against Pygments, the de-facto standard for syntax highlighting.
13+
14+
Requirements:
15+
pip install pygments
16+
"""
17+
18+
import json
19+
import sys
20+
21+
try:
22+
from pygments.lexers import get_lexer_by_name
23+
from pygments.token import Token
24+
except ImportError:
25+
print(json.dumps({"error": "pygments is not installed. Install it with: pip install pygments"}))
26+
sys.exit(1)
27+
28+
# Map alto language identifiers to Pygments lexer names
29+
# Each entry is ``identifier: (pygments_lexer_alias, lexer_options)``.
# The options dict is forwarded as keyword arguments to get_lexer_by_name().
LANGUAGE_MAP = {
    # Markup and templating
    "php": ("php", {"startinline": False}),
    "html": ("html", {}),
    "svg": ("html", {}),
    "xml": ("xml", {}),
    # Data and query formats
    "yaml": ("yaml", {}),
    "sql": ("sql", {}),
    "json": ("json", {}),
    # Stylesheets and documents
    "css": ("css", {}),
    "scss": ("scss", {}),
    "markdown": ("markdown", {}),
    # Web scripting
    "javascript": ("javascript", {}),
    "typescript": ("typescript", {}),
    "twig": ("twig", {}),
    # Build, shell and configuration
    "makefile": ("makefile", {}),
    "bash": ("bash", {}),
    "ini": ("ini", {}),
    "http": ("http", {}),
    # General-purpose languages
    "go": ("go", {}),
    "rust": ("rust", {}),
    "ruby": ("ruby", {}),
    "swift": ("swift", {}),
    "python": ("python3", {}),
    "java": ("java", {}),
    "csharp": ("csharp", {}),
    # Identifiers whose Pygments alias differs from the key
    "dockerfile": ("docker", {}),
    "diff": ("diff", {}),
    "dotenv": ("bash", {}),
}
58+
59+
# Map Pygments token types to alto scope categories
60+
# Keys are Pygments token types, values are the simplified category strings
# emitted in the "category" field of each token.
SCOPE_CATEGORY_MAP = {
    # Comments
    Token.Comment: "comment",
    Token.Comment.Single: "comment",
    Token.Comment.Multiline: "comment",
    Token.Comment.Preproc: "comment",
    Token.Comment.PreprocFile: "comment",
    Token.Comment.Special: "comment",
    Token.Comment.Hashbang: "comment",

    # Keywords
    Token.Keyword: "keyword",
    Token.Keyword.Declaration: "keyword",
    Token.Keyword.Namespace: "keyword",
    Token.Keyword.Pseudo: "keyword",
    Token.Keyword.Reserved: "keyword",
    Token.Keyword.Type: "keyword",
    Token.Keyword.Constant: "keyword",

    # Names
    Token.Name.Function: "function",
    Token.Name.Function.Magic: "function",
    Token.Name.Class: "type",
    Token.Name.Decorator: "attribute",
    Token.Name.Variable: "variable",
    Token.Name.Variable.Class: "variable",
    Token.Name.Variable.Global: "variable",
    Token.Name.Variable.Instance: "variable",
    Token.Name.Variable.Magic: "variable",
    Token.Name.Attribute: "property",
    Token.Name.Builtin: "builtin",
    Token.Name.Builtin.Pseudo: "builtin",
    Token.Name.Tag: "tag",
    Token.Name.Namespace: "namespace",
    Token.Name.Constant: "constant",

    # String literals (regex literals get their own category)
    Token.Literal.String: "string",
    Token.Literal.String.Single: "string",
    Token.Literal.String.Double: "string",
    Token.Literal.String.Backtick: "string",
    Token.Literal.String.Doc: "string",
    Token.Literal.String.Escape: "string",
    Token.Literal.String.Heredoc: "string",
    Token.Literal.String.Interpol: "string",
    Token.Literal.String.Other: "string",
    Token.Literal.String.Regex: "regexp",
    Token.Literal.String.Symbol: "string",
    Token.Literal.String.Affix: "string",

    # Numeric literals
    Token.Literal.Number: "number",
    Token.Literal.Number.Bin: "number",
    Token.Literal.Number.Float: "number",
    Token.Literal.Number.Hex: "number",
    Token.Literal.Number.Integer: "number",
    Token.Literal.Number.Integer.Long: "number",
    Token.Literal.Number.Oct: "number",

    # Operators and punctuation
    Token.Operator: "operator",
    Token.Operator.Word: "operator",
    Token.Punctuation: "punctuation",
    Token.Punctuation.Marker: "punctuation",

    # Generic (diff / document structure) tokens
    Token.Generic.Inserted: "diff.added",
    Token.Generic.Deleted: "diff.removed",
    Token.Generic.Heading: "section",
    Token.Generic.Subheading: "section",

    # Remaining name kinds
    Token.Name.Label: "label",
    Token.Name.Entity: "entity",
}
120+
121+
122+
def get_scope_category(token_type):
    """Return the simplified scope category for a Pygments token type.

    The token type itself is looked up in SCOPE_CATEGORY_MAP first; if it
    has no entry, each ancestor reached through ``.parent`` is tried in
    turn. The walk ends as soon as a falsy token type is reached.

    Args:
        token_type: A Pygments token type (e.g. ``Token.Name.Function``).

    Returns:
        The mapped category string, or ``"other"`` when neither the token
        type nor any ancestor visited by the walk has an entry.
    """
    node = token_type
    while node:
        category = SCOPE_CATEGORY_MAP.get(node)
        if category is not None:
            return category
        # Move one level up; a falsy parent terminates the loop.
        node = node.parent
    return "other"
134+
135+
136+
def tokenize(language, code):
    """Tokenize *code* with the Pygments lexer mapped to *language*.

    Args:
        language: Language identifier; must be a key of LANGUAGE_MAP.
        code: Source text to tokenize.

    Returns:
        On success, ``{"tokens": [...], "language": language}`` where every
        token is a dict with the keys ``"text"``, ``"pygments_type"`` (the
        stringified Pygments token type) and ``"category"`` (the result of
        get_scope_category()).
        On failure, ``{"error": message}``: either the language is not in
        LANGUAGE_MAP or the installed Pygments has no lexer for the mapped
        alias.
    """
    # Imported here so the lookup failure can be caught precisely without
    # touching the module-level import block.
    from pygments.util import ClassNotFound

    entry = LANGUAGE_MAP.get(language)
    if entry is None:
        return {"error": f"Unsupported language: {language}"}

    lexer_name, options = entry
    try:
        lexer = get_lexer_by_name(lexer_name, **options)
    except ClassNotFound as exc:
        # Report a missing lexer in the same JSON error shape as every other
        # failure instead of letting a traceback escape.
        return {
            "error": (
                f"Pygments lexer '{lexer_name}' for language "
                f"'{language}' is not available: {exc}"
            )
        }

    tokens = []
    for tok_type, tok_value in lexer.get_tokens(code):
        # Drop every token of type Token.Text (or a subtype of it), and any
        # token of another type whose text is empty or whitespace-only.
        if tok_type in Token.Text or not tok_value.strip():
            continue

        tokens.append({
            "text": tok_value,
            "pygments_type": str(tok_type),
            "category": get_scope_category(tok_type),
        })

    return {"tokens": tokens, "language": language}
158+
159+
160+
def main():
    """Command-line entry point: tokenize a file or stdin and print JSON.

    ``sys.argv[1]`` is the language identifier (lower-cased before use).
    If ``sys.argv[2]`` is present it names the file to read; otherwise the
    code is read from standard input. The result of tokenize() is printed
    to standard output as a single JSON document.

    Exits with status 1 after printing a JSON ``{"error": ...}`` object when
    no language argument is given or the input file cannot be read.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Usage: pygments-tokenize <language> [file]"}))
        sys.exit(1)

    language = sys.argv[1].lower()

    if len(sys.argv) >= 3:
        path = sys.argv[2]
        try:
            # Decode explicitly as UTF-8: without an encoding argument,
            # open() falls back to the locale's preferred encoding, which
            # varies between machines.
            # NOTE(review): assumes input files are UTF-8 - confirm.
            with open(path, "r", encoding="utf-8") as f:
                code = f.read()
        except (OSError, UnicodeDecodeError) as exc:
            # Keep the failure machine-readable, like the other error paths.
            print(json.dumps({"error": f"Cannot read {path}: {exc}"}))
            sys.exit(1)
    else:
        code = sys.stdin.read()

    result = tokenize(language, code)
    print(json.dumps(result, ensure_ascii=False))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)