Skip to content

Commit 7b77386

Browse files
committed
Evo: add more missed token/range mappings.
1 parent eff0381 commit 7b77386

4 files changed

Lines changed: 460 additions & 4 deletions

File tree

check_ce_token_mappings.py

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import re
4+
import xml.etree.ElementTree as ET
5+
from pathlib import Path
6+
7+
8+
# Directory containing this script; all data files are resolved relative to it.
ROOT = Path(__file__).resolve().parent
# TI-Toolkit token sheet describing the CE (8X) token set.
TOKENS_XML = ROOT / "ti-toolkit-8x-tokens.xml"
# C++ source holding the CE<->Evo token conversion functions parsed below.
EVO_FORMAT = ROOT / "src" / "EvoFormat.cpp"
11+
12+
13+
def hex_word(value):
    """Render a 16-bit token value as "0xNNNN" (uppercase, zero-padded hex)."""
    return f"0x{value:04X}"
15+
16+
17+
def extract_function(source, name):
    """Return the body of C++ function *name* (text between its outer braces).

    Locates a `static bool name(...) {` header, then walks forward matching
    braces until the opening brace is balanced.  Raises RuntimeError if the
    header or the closing brace cannot be found.
    """
    match = re.search(rf"static\s+bool\s+{name}\s*\([^)]*\)\s*\{{", source)
    if match is None:
        raise RuntimeError(f"Could not find {name}")

    open_brace = match.end() - 1
    depth = 0
    # NOTE(review): brace counting ignores braces inside strings/comments —
    # fine for the generated mapping functions this targets.
    for position, char in enumerate(source[open_brace:], start=open_brace):
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return source[open_brace + 1:position]

    raise RuntimeError(f"Could not find end of {name}")
33+
34+
35+
def token_label(token_element):
    """Best English name for a token element, or "(unnamed)".

    Prefers the <lang code="en"> display attribute, then its <accessible>
    text; falls through to "(unnamed)" when neither is present.
    """
    for lang in token_element.findall(".//lang"):
        if lang.get("code") != "en":
            continue
        display = lang.get("display")
        if display:
            return display
        accessible = lang.findtext("accessible")
        if accessible:
            return accessible
    return "(unnamed)"
45+
46+
47+
def parse_ce_tokens(path):
    """Parse the TI-Toolkit XML sheet into sorted (value, name) pairs.

    One-byte tokens keep their byte value; two-byte tokens are packed as
    (high << 8) | low.  Values in the XML are "$XX" hex strings.
    """
    tokens = []
    for child in ET.fromstring(path.read_text()):
        if child.tag == "token":
            value = int(child.get("value")[1:], 16)
            tokens.append((value, token_label(child)))
        elif child.tag == "two-byte":
            high_byte = int(child.get("value")[1:], 16)
            tokens.extend(
                ((high_byte << 8) | int(sub.get("value")[1:], 16), token_label(sub))
                for sub in child.findall("token")
            )
    return sorted(tokens)
59+
60+
61+
def parse_direct_map(function_body):
    """Extract (source, dest) pairs from a C++ `direct` unordered_map initializer.

    Returns [] when the function body contains no such map.
    """
    table = re.search(
        r"static\s+const\s+std::unordered_map<[^>]+>\s+direct\s*=\s*\{(?P<body>.*?)\n\s*\};",
        function_body,
        re.S,
    )
    if table is None:
        return []

    entries = re.findall(
        r"\{\s*(0x[0-9A-Fa-f]+)\s*,\s*(0x[0-9A-Fa-f]+)\s*\}",
        table.group("body"),
    )
    return [(int(src, 16), int(dst, 16)) for src, dst in entries]
75+
76+
77+
def parse_range_maps(function_body, source_var, dest_var):
    """Expand C++ range-remap `if` blocks into explicit (source, dest) pairs.

    Recognizes blocks of the form
        if (src >= A && src <= B) { dst = static_cast<uint16_t>(C + (src - D)); return true; }
    and enumerates every token in [A, B] with its remapped destination value.
    Blocks whose body lacks the expected assignment are skipped.
    """
    range_pattern = re.compile(
        rf"if\s*\(\s*{source_var}\s*>=\s*(0x[0-9A-Fa-f]+)\s*&&\s*"
        rf"{source_var}\s*<=\s*(0x[0-9A-Fa-f]+)\s*\)\s*\{{(?P<body>.*?)return\s+true\s*;",
        re.S,
    )
    assignment_pattern = re.compile(
        rf"{dest_var}\s*=\s*static_cast<uint16_t>\(\s*"
        rf"(0x[0-9A-Fa-f]+)\s*\+\s*\(\s*{source_var}\s*-\s*"
        rf"(0x[0-9A-Fa-f]+)\s*\)\s*\)"
    )

    pairs = []
    for block in range_pattern.finditer(function_body):
        assignment = assignment_pattern.search(block.group("body"))
        if assignment is None:
            continue

        first = int(block.group(1), 16)
        last = int(block.group(2), 16)
        dest_base = int(assignment.group(1), 16)
        source_base = int(assignment.group(2), 16)

        offset = dest_base - source_base
        pairs.extend((token, token + offset) for token in range(first, last + 1))
    return pairs
103+
104+
105+
def build_mappings(format_path):
    """Reconstruct both token conversion tables from EvoFormat.cpp.

    Returns (evo_to_legacy, legacy_to_evo) dicts.  Range-derived entries are
    collected first; explicit direct-map entries override them on conflict
    (dict.update).
    """
    source = format_path.read_text()

    def table(function_name, source_var, dest_var):
        # One conversion function -> one lookup dict.
        body = extract_function(source, function_name)
        mapping = dict(parse_range_maps(body, source_var, dest_var))
        mapping.update(parse_direct_map(body))
        return mapping

    evo_to_legacy = table("direct_legacy_token_for_evo", "evoToken", "legacyToken")
    legacy_to_evo = table("direct_evo_token_for_legacy", "legacyToken", "evoToken")
    return evo_to_legacy, legacy_to_evo
118+
119+
120+
def print_section(title, rows):
    """Print a "title: count" header, then one line per (token, name) row."""
    lines = [f"{title}: {len(rows)}"]
    lines.extend(f" {hex_word(token)} {name}" for token, name in rows)
    print("\n".join(lines))
124+
125+
126+
def compact_ranges(rows):
    """Collapse a sorted (token, name) list into runs of consecutive tokens.

    Returns (first_token, first_name, last_token, last_name) tuples, one per
    maximal run of token values that increase by exactly one.
    """
    if not rows:
        return []

    runs = []
    run_start = run_end = rows[0]
    for entry in rows[1:]:
        if entry[0] == run_end[0] + 1:
            # Still contiguous — extend the current run.
            run_end = entry
        else:
            runs.append((*run_start, *run_end))
            run_start = run_end = entry
    runs.append((*run_start, *run_end))
    return runs
144+
145+
146+
def print_range_section(title, rows):
    """Print rows as compact contiguous ranges (see compact_ranges)."""
    grouped = compact_ranges(rows)
    print(f"{title}: {len(rows)} tokens in {len(grouped)} ranges")
    for first_token, first_name, last_token, last_name in grouped:
        if first_token == last_token:
            print(f" {hex_word(first_token)} {first_name}")
        else:
            print(f" {hex_word(first_token)}..{hex_word(last_token)} {first_name}..{last_name}")
153+
154+
155+
def main():
    """Report CE tokens from the XML sheet missing from either conversion mapping.

    Returns 1 when any token is uncovered (for use as an exit status), else 0.
    """
    parser = argparse.ArgumentParser(
        description="Check ti-toolkit-8x-tokens.xml CE token coverage in CE<->Evo token conversion mappings."
    )
    parser.add_argument(
        "--ranges",
        action="store_true",
        help="Print missing tokens as compact contiguous ranges instead of one token per line.",
    )
    args = parser.parse_args()

    tokens = parse_ce_tokens(TOKENS_XML)
    evo_to_legacy, legacy_to_evo = build_mappings(EVO_FORMAT)
    # CE tokens that the Evo->CE conversion can ever emit.
    produced_ce_tokens = set(evo_to_legacy.values())

    missing_legacy_to_evo = [row for row in tokens if row[0] not in legacy_to_evo]
    missing_evo_to_legacy = [row for row in tokens if row[0] not in produced_ce_tokens]

    print(f"XML CE tokens checked: {len(tokens)}")
    print(f"CE->Evo mapped CE tokens: {len(legacy_to_evo)}")
    print(f"Evo->CE produced CE tokens: {len(produced_ce_tokens)}")
    print()
    section_printer = print_range_section if args.ranges else print_section
    section_printer("Missing from CE->Evo mapping", missing_legacy_to_evo)
    print()
    section_printer("Not produced by Evo->CE mapping", missing_evo_to_legacy)

    return 1 if missing_legacy_to_evo or missing_evo_to_legacy else 0


if __name__ == "__main__":
    raise SystemExit(main())

check_evo_token_mappings.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import re
4+
from pathlib import Path
5+
6+
7+
# Directory containing this script; all data files are resolved relative to it.
ROOT = Path(__file__).resolve().parent
# Evo token table: .inc file with {0xNNNN, "NAME"} entries (see parse_evo_tokens).
EVO_TOKENS = ROOT / "src" / "EvoTokens.inc"
# C++ source holding the CE<->Evo token conversion functions parsed below.
EVO_FORMAT = ROOT / "src" / "EvoFormat.cpp"
10+
11+
12+
def hex_word(value):
    """Render a 16-bit token value as "0xNNNN" (uppercase, zero-padded hex)."""
    return f"0x{value:04X}"
14+
15+
16+
def extract_function(source, name):
    """Return the body of C++ function *name* (text between its outer braces).

    Locates a `static bool name(...) {` header, then walks forward matching
    braces until the opening brace is balanced.  Raises RuntimeError if the
    header or the closing brace cannot be found.
    """
    match = re.search(rf"static\s+bool\s+{name}\s*\([^)]*\)\s*\{{", source)
    if match is None:
        raise RuntimeError(f"Could not find {name}")

    open_brace = match.end() - 1
    depth = 0
    # NOTE(review): brace counting ignores braces inside strings/comments —
    # fine for the generated mapping functions this targets.
    for position, char in enumerate(source[open_brace:], start=open_brace):
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return source[open_brace + 1:position]

    raise RuntimeError(f"Could not find end of {name}")
32+
33+
34+
def parse_evo_tokens(path):
    """Parse EvoTokens.inc into (value, name) pairs in file order.

    Matches every {0xNNNN, "NAME"} initializer entry in the file.
    """
    entry = re.compile(r'\{\s*(0x[0-9A-Fa-f]+)\s*,\s*"([^"]+)"\s*\}')
    return [
        (int(match.group(1), 16), match.group(2))
        for match in entry.finditer(path.read_text())
    ]
40+
41+
42+
def parse_direct_map(function_body):
    """Extract (source, dest) pairs from a C++ `direct` unordered_map initializer.

    Returns [] when the function body contains no such map.
    """
    table = re.search(
        r"static\s+const\s+std::unordered_map<[^>]+>\s+direct\s*=\s*\{(?P<body>.*?)\n\s*\};",
        function_body,
        re.S,
    )
    if table is None:
        return []

    entries = re.findall(
        r"\{\s*(0x[0-9A-Fa-f]+)\s*,\s*(0x[0-9A-Fa-f]+)\s*\}",
        table.group("body"),
    )
    return [(int(src, 16), int(dst, 16)) for src, dst in entries]
56+
57+
58+
def parse_range_maps(function_body, source_var, dest_var):
    """Expand C++ range-remap `if` blocks into explicit (source, dest) pairs.

    Recognizes blocks of the form
        if (src >= A && src <= B) { dst = static_cast<uint16_t>(C + (src - D)); return true; }
    and enumerates every token in [A, B] with its remapped destination value.
    Blocks whose body lacks the expected assignment are skipped.
    """
    range_pattern = re.compile(
        rf"if\s*\(\s*{source_var}\s*>=\s*(0x[0-9A-Fa-f]+)\s*&&\s*"
        rf"{source_var}\s*<=\s*(0x[0-9A-Fa-f]+)\s*\)\s*\{{(?P<body>.*?)return\s+true\s*;",
        re.S,
    )
    assignment_pattern = re.compile(
        rf"{dest_var}\s*=\s*static_cast<uint16_t>\(\s*"
        rf"(0x[0-9A-Fa-f]+)\s*\+\s*\(\s*{source_var}\s*-\s*"
        rf"(0x[0-9A-Fa-f]+)\s*\)\s*\)"
    )

    pairs = []
    for block in range_pattern.finditer(function_body):
        assignment = assignment_pattern.search(block.group("body"))
        if assignment is None:
            continue

        first = int(block.group(1), 16)
        last = int(block.group(2), 16)
        dest_base = int(assignment.group(1), 16)
        source_base = int(assignment.group(2), 16)

        offset = dest_base - source_base
        pairs.extend((token, token + offset) for token in range(first, last + 1))
    return pairs
84+
85+
86+
def build_mappings(format_path):
    """Reconstruct both token conversion tables from EvoFormat.cpp.

    Returns (evo_to_legacy, legacy_to_evo) dicts.  Range-derived entries are
    collected first; explicit direct-map entries override them on conflict
    (dict.update).
    """
    source = format_path.read_text()

    def table(function_name, source_var, dest_var):
        # One conversion function -> one lookup dict.
        body = extract_function(source, function_name)
        mapping = dict(parse_range_maps(body, source_var, dest_var))
        mapping.update(parse_direct_map(body))
        return mapping

    evo_to_legacy = table("direct_legacy_token_for_evo", "evoToken", "legacyToken")
    legacy_to_evo = table("direct_evo_token_for_legacy", "legacyToken", "evoToken")
    return evo_to_legacy, legacy_to_evo
99+
100+
101+
def print_section(title, rows):
    """Print a "title: count" header, then one line per (token, name) row."""
    lines = [f"{title}: {len(rows)}"]
    lines.extend(f" {hex_word(token)} {name}" for token, name in rows)
    print("\n".join(lines))
105+
106+
107+
def compact_ranges(rows):
    """Collapse a sorted (token, name) list into runs of consecutive tokens.

    Returns (first_token, first_name, last_token, last_name) tuples, one per
    maximal run of token values that increase by exactly one.
    """
    if not rows:
        return []

    runs = []
    run_start = run_end = rows[0]
    for entry in rows[1:]:
        if entry[0] == run_end[0] + 1:
            # Still contiguous — extend the current run.
            run_end = entry
        else:
            runs.append((*run_start, *run_end))
            run_start = run_end = entry
    runs.append((*run_start, *run_end))
    return runs
125+
126+
127+
def print_range_section(title, rows):
    """Print rows as compact contiguous ranges (see compact_ranges)."""
    grouped = compact_ranges(rows)
    print(f"{title}: {len(rows)} tokens in {len(grouped)} ranges")
    for first_token, first_name, last_token, last_name in grouped:
        if first_token == last_token:
            print(f" {hex_word(first_token)} {first_name}")
        else:
            print(f" {hex_word(first_token)}..{hex_word(last_token)} {first_name}..{last_name}")
134+
135+
136+
def main():
    """Report Evo tokens from EvoTokens.inc missing from either conversion mapping.

    Returns 1 when any token is uncovered (for use as an exit status), else 0.
    """
    parser = argparse.ArgumentParser(
        description="Check EvoTokens.inc token coverage in CE<->Evo token conversion mappings."
    )
    parser.add_argument(
        "--include-eos",
        action="store_true",
        help="Include TOK_EOS in the check. It is skipped by default because tokenized Evo conversion stops at EOS.",
    )
    parser.add_argument(
        "--ranges",
        action="store_true",
        help="Print missing tokens as compact contiguous ranges instead of one token per line.",
    )
    args = parser.parse_args()

    tokens = parse_evo_tokens(EVO_TOKENS)
    if not args.include_eos:
        # Drop TOK_EOS (0x0000) unless explicitly requested.
        tokens = [row for row in tokens if row[0] != 0x0000]

    evo_to_legacy, legacy_to_evo = build_mappings(EVO_FORMAT)
    # Evo tokens that the CE->Evo conversion can ever emit.
    produced_evo_tokens = set(legacy_to_evo.values())

    missing_evo_to_legacy = [row for row in tokens if row[0] not in evo_to_legacy]
    missing_legacy_to_evo = [row for row in tokens if row[0] not in produced_evo_tokens]

    print(f"Tokens checked: {len(tokens)}")
    print(f"Evo->CE mapped Evo tokens: {len(evo_to_legacy)}")
    print(f"CE->Evo produced Evo tokens: {len(produced_evo_tokens)}")
    print()
    section_printer = print_range_section if args.ranges else print_section
    section_printer("Missing from Evo->CE mapping", missing_evo_to_legacy)
    print()
    section_printer("Missing from CE->Evo mapping", missing_legacy_to_evo)

    return 1 if missing_evo_to_legacy or missing_legacy_to_evo else 0


if __name__ == "__main__":
    raise SystemExit(main())

0 commit comments

Comments
 (0)