Skip to content

Commit bd75483

Browse files
Matt Corallo (TheBlueMatt)
authored and committed
Add an auto-generated unicode character category file
1a01b5a added detection of unicode format characters in `PrintableString`, but used a hard-coded table which may eventually become out of date. Here we switch to an auto-generated table, include all `General_Category` `Other` characters, and also ban unallocated code points. Finally, CI validates that the file is kept up to date. Written by Claude
1 parent d12f9ea commit bd75483

5 files changed

Lines changed: 1118 additions & 34 deletions

File tree

.github/workflows/build.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,11 @@ jobs:
313313
- name: Run default clippy linting
314314
run: |
315315
./ci/check-lint.sh
316+
- name: Check Unicode general-category table is up to date
317+
run: |
318+
curl --proto '=https' --tlsv1.2 -fsSL -o /tmp/UnicodeData.txt https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
319+
contrib/gen_unicode_general_category.py /tmp/UnicodeData.txt -o /tmp/unicode.rs
320+
diff -u lightning-types/src/unicode.rs /tmp/unicode.rs
316321
317322
rustfmt:
318323
runs-on: ubuntu-latest
Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
#!/usr/bin/env python3
2+
# This file is Copyright its original authors, visible in version control
3+
# history.
4+
#
5+
# This file is licensed under the Apache License, Version 2.0 <LICENSE-APACHE
6+
# or http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your option.
8+
# You may not use this file except in accordance with one or both of these
9+
# licenses.
10+
11+
"""Generate Unicode general-category predicates from `UnicodeData.txt`.
12+
13+
Emits two `pub(crate)` functions taking a `char`, split into two disjoint
14+
buckets across the Unicode top-level `C` ("Other") category so callers can
15+
compose them:
16+
17+
is_unicode_general_category_other — Cc / Cf / Cs / Co (assigned)
18+
is_unicode_general_category_unassigned — Cn (plus codepoints above
19+
U+10FFFF, which aren't
20+
valid codepoints at all)
21+
22+
`UnicodeData.txt` is the canonical machine-readable listing of every assigned
23+
codepoint in the Unicode Character Database. Each line is `;`-separated; field
24+
0 is the codepoint (hex), field 1 is the name, and field 2 is the two-letter
25+
general category (e.g. `Lu`, `Cf`, `Mn`). Codepoints absent from the file have
26+
category `Cn` (Unassigned) by convention.
27+
28+
Two encoding details to preserve:
29+
* Large blocks of contiguous same-category codepoints are written as two
30+
consecutive entries whose names end in `, First>` and `, Last>`. Every
31+
codepoint between First and Last (inclusive) shares the listed category.
32+
* The codepoint range is U+0000..=U+10FFFF.
33+
34+
Each `matches!` arm in the assigned-Other table carries an end-of-line comment
35+
derived from the `UnicodeData.txt` name field — typically the longest common
36+
word prefix or suffix across the names in the range, falling back to the set
37+
of categories when the names share nothing meaningful. The unassigned table
38+
omits per-arm comments since every range there has the same meaning by
39+
construction.
40+
41+
Usage:
42+
contrib/gen_unicode_general_category.py UnicodeData.txt > out.rs
43+
"""
44+
45+
import argparse
46+
import sys
47+
from pathlib import Path
48+
49+
MAX_CODEPOINT = 0x10FFFF
50+
51+
LICENSE_HEADER = """\
52+
// This file is Copyright its original authors, visible in version control
53+
// history.
54+
//
55+
// This file is licensed under the Apache License, Version 2.0 <LICENSE-APACHE
56+
// or http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
57+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your option.
58+
// You may not use this file except in accordance with one or both of these
59+
// licenses.
60+
"""
61+
62+
GENERATED_NOTICE = """\
63+
// Auto-generated from the Unicode Character Database (UnicodeData.txt) by
64+
// contrib/gen_unicode_general_category.py. Do not edit by hand; rerun the
65+
// generator with an updated UnicodeData.txt to refresh the table.
66+
"""
67+
68+
69+
def _normalize_name(name):
70+
"""Strip the `<...>` wrapping and `, First` / `, Last` range markers so
71+
that, e.g., `<Non Private Use High Surrogate, First>` becomes
72+
`Non Private Use High Surrogate` and `<control>` becomes `control`.
73+
"""
74+
if name.startswith("<") and name.endswith(">"):
75+
inner = name[1:-1]
76+
for suffix in (", First", ", Last"):
77+
if inner.endswith(suffix):
78+
inner = inner[: -len(suffix)]
79+
return inner
80+
return name
81+
82+
83+
def parse_categories(path):
    """Parse the `UnicodeData.txt` file at `path` into `(cats, names)`.

    `cats` maps each listed codepoint to its general category (field 2) and
    `names` maps it to its normalised name (field 1, passed through
    `_normalize_name`). A `, First>` / `, Last>` pair is expanded so every
    codepoint in the inclusive range gets the shared category and name.
    Codepoints absent from the returned dicts have category `Cn`
    (Unassigned) and no name.

    Raises `ValueError` when a line has fewer than three `;`-separated
    fields, when a range is opened but not immediately closed by a
    `, Last>` entry, when the two ends of a range disagree on category, when
    a range ends before it starts, when a `, Last>` entry appears without a
    preceding `, First>`, or when the file ends with a range still open.
    """
    cats = {}
    names = {}
    pending_first = None  # (first_cp, first_cat, normalised_name) once a range opens.
    # Pin the encoding so parsing does not depend on the platform's locale
    # default.
    with path.open(encoding="utf-8") as f:
        for lineno, raw in enumerate(f, 1):
            line = raw.rstrip("\n")
            if not line:
                continue
            fields = line.split(";")
            if len(fields) < 3:
                raise ValueError(f"{path}:{lineno}: expected at least 3 fields, got {len(fields)}")
            cp = int(fields[0], 16)
            name = fields[1]
            cat = fields[2]
            if pending_first is not None:
                first_cp, first_cat, first_name = pending_first
                if not name.endswith(", Last>"):
                    raise ValueError(
                        f"{path}:{lineno}: expected `, Last>` to close range "
                        f"opened at U+{first_cp:04X}, got name {name!r}"
                    )
                if cat != first_cat:
                    raise ValueError(
                        f"{path}:{lineno}: range U+{first_cp:04X}..=U+{cp:04X} "
                        f"has mismatched categories {first_cat!r} / {cat!r}"
                    )
                if cp < first_cp:
                    # A reversed range would otherwise expand to nothing and
                    # silently drop both endpoints from the tables.
                    raise ValueError(
                        f"{path}:{lineno}: range end U+{cp:04X} precedes its "
                        f"start U+{first_cp:04X}"
                    )
                for x in range(first_cp, cp + 1):
                    cats[x] = cat
                    names[x] = first_name
                pending_first = None
            elif name.endswith(", First>"):
                pending_first = (cp, cat, _normalize_name(name))
            elif name.endswith(", Last>"):
                # Without this check an orphaned range terminator would be
                # recorded as a lone codepoint and the rest of the range lost.
                raise ValueError(
                    f"{path}:{lineno}: `, Last>` entry at U+{cp:04X} has no "
                    f"preceding `, First>` entry"
                )
            else:
                cats[cp] = cat
                names[cp] = _normalize_name(name)
    if pending_first is not None:
        raise ValueError(f"{path}: dangling `, First>` entry at U+{pending_first[0]:04X}")
    return cats, names
126+
127+
128+
ASSIGNED_OTHER_CATS = frozenset({"Cc", "Cf", "Cs", "Co"})
129+
130+
131+
def coalesce_ranges(cats, names, target_cats, *, label):
    """Collapse the codepoints whose category is in `target_cats` into runs.

    Scans U+0000..=U+10FFFF in order and returns a list of
    `(start, end, label)` tuples, one per maximal contiguous run (both ends
    inclusive). A codepoint missing from `cats` counts as `Cn`.

    When `label` is true each run's label is built by `_make_label` from the
    names and categories found inside the run; otherwise every label is the
    empty string.
    """
    spans = []
    run_start = None
    for cp in range(MAX_CODEPOINT + 1):
        if cats.get(cp, "Cn") in target_cats:
            if run_start is None:
                run_start = cp
        elif run_start is not None:
            spans.append((run_start, cp - 1))
            run_start = None
    # A run still open after the scan extends to the last codepoint.
    if run_start is not None:
        spans.append((run_start, MAX_CODEPOINT))

    if not label:
        return [(lo, hi, "") for lo, hi in spans]

    def describe(lo, hi):
        members = range(lo, hi + 1)
        seen_cats = {cats.get(cp, "Cn") for cp in members}
        seen_names = [n for n in map(names.get, members) if n is not None]
        return _make_label(seen_names, seen_cats)

    return [(lo, hi, describe(lo, hi)) for lo, hi in spans]
165+
166+
167+
def _common_word_run(names, *, from_end):
168+
"""Return the longest sequence of words shared by every name, taken from
169+
either the start (`from_end=False`) or the end (`from_end=True`) of each
170+
name's whitespace-split tokens.
171+
"""
172+
if not names:
173+
return ""
174+
tokenised = [n.split() for n in names]
175+
if from_end:
176+
tokenised = [list(reversed(t)) for t in tokenised]
177+
limit = min(len(t) for t in tokenised)
178+
common = []
179+
for i in range(limit):
180+
token = tokenised[0][i]
181+
if all(t[i] == token for t in tokenised):
182+
common.append(token)
183+
else:
184+
break
185+
if from_end:
186+
common.reverse()
187+
return " ".join(common)
188+
189+
190+
def _make_label(names, cats_in_range):
191+
"""Build a short human-readable label for a coalesced range. Applied to
192+
the assigned-Other buckets only; each range there is `Cc`, `Cf`, `Cs`,
193+
`Co`, or some contiguous union thereof.
194+
195+
Rules, in order:
196+
1. All names identical → that name (e.g. `control`).
197+
2. Common leading or trailing words → the longer of the two.
198+
3. Otherwise, list the categories present (e.g. `Co / Cs`).
199+
"""
200+
unique = list(dict.fromkeys(names))
201+
if len(unique) == 1:
202+
return unique[0]
203+
204+
prefix = _common_word_run(names, from_end=False)
205+
suffix = _common_word_run(names, from_end=True)
206+
# Pick whichever is more informative; when both are non-empty, prefer the
207+
# longer one. A multi-word prefix beats a single-word suffix.
208+
label = prefix if len(prefix) >= len(suffix) else suffix
209+
if label:
210+
return label
211+
return " / ".join(sorted(cats_in_range))
212+
213+
214+
def fmt_codepoint(cp):
    """Format `cp` as a `0x`-prefixed uppercase hex literal.

    Values are zero-padded to at least four digits and grow wider as needed,
    the same shape `UnicodeData.txt` uses, so the output stays readable next
    to the source data.
    """
    # `04X` is a minimum width, so values above 0xFFFF print unpadded.
    return "0x" + format(cp, "04X")
218+
219+
220+
def _pattern(start, end):
    """Render the Rust match pattern for the inclusive range `start..=end`.

    A single-codepoint range is written as a bare literal; anything else is
    written as `lo..=hi`, with both ends formatted by `fmt_codepoint`.
    """
    lo = fmt_codepoint(start)
    return lo if start == end else f"{lo}..={fmt_codepoint(end)}"
224+
225+
226+
def _emit_matches_body(lines, arms):
227+
"""Append a `matches!(c as u32, ...)` body to `lines`, with one
228+
`(pattern, label)` tuple per arm. The first arm sits at the `matches!`
229+
argument indent and continuation `| ...` arms indent one level deeper,
230+
matching the rustfmt convention used elsewhere in the tree.
231+
"""
232+
lines.append("\tmatches!(")
233+
lines.append("\t\tc as u32,")
234+
for i, (pattern, label) in enumerate(arms):
235+
prefix = "\t\t" if i == 0 else "\t\t\t| "
236+
comment = f" // {label}" if label else ""
237+
lines.append(f"{prefix}{pattern}{comment}")
238+
lines.append("\t)")
239+
240+
241+
def render_rust(other_ranges, unassigned_ranges):
    """Build the complete generated Rust source as one string.

    Both arguments are lists of `(start, end, label)` tuples. The output is
    the licence header, the generated-file notice, and two `pub(crate)`
    predicates over `char`: one for the assigned-Other ranges and one for
    the unassigned ranges. The unassigned predicate also receives a final
    synthetic arm matching every `u32` above U+10FFFF; such values are not
    valid codepoints, so the unassigned bucket is the closest fit.
    """

    def as_arms(ranges):
        return [(_pattern(lo, hi), text) for lo, hi, text in ranges]

    def emit_predicate(out, header, arms):
        # Doc comment + attribute + signature, then the body, then the
        # closing brace and a blank separator line.
        out.extend(header)
        _emit_matches_body(out, arms)
        out.extend(("}", ""))

    other_header = (
        "/// Returns `true` if `c` is in Unicode general category `Cc` (Control), `Cf`",
        "/// (Format), `Cs` (Surrogate), or `Co` (Private Use) — the assigned codepoints",
        "/// in the top-level `C` (\"Other\") category. The `Cs` portion of the table is",
        "/// unreachable for `char` input (a `char` cannot hold a surrogate) but is kept",
        "/// so the table mirrors the source UCD data verbatim. The disjoint `Cn`",
        "/// (Unassigned) bucket is `is_unicode_general_category_unassigned`.",
        "#[allow(dead_code)]",
        "pub(crate) fn is_unicode_general_category_other(c: char) -> bool {",
    )
    unassigned_header = (
        "/// Returns `true` if `c` is in Unicode general category `Cn` (Unassigned), or",
        "/// strictly above U+10FFFF. The trailing `0x110000..=u32::MAX` arm is",
        "/// unreachable for `char` input (a `char` is bounded to U+10FFFF) but is kept",
        "/// for defensive coverage of the underlying `u32`. The disjoint Cc / Cf / Cs /",
        "/// Co bucket is `is_unicode_general_category_other`.",
        "#[allow(dead_code)]",
        "pub(crate) fn is_unicode_general_category_unassigned(c: char) -> bool {",
    )

    out = [LICENSE_HEADER, GENERATED_NOTICE]
    emit_predicate(out, other_header, as_arms(other_ranges))

    unassigned_arms = as_arms(unassigned_ranges)
    unassigned_arms.append(("0x110000..=u32::MAX", "above U+10FFFF — unreachable for `char`"))
    emit_predicate(out, unassigned_header, unassigned_arms)

    return "\n".join(out)
279+
280+
281+
def main(argv):
    """Command-line entry point: parse `argv`, generate, and emit the table.

    `argv` is the argument list without the program name. It must contain
    the path to `UnicodeData.txt` and may contain `-o/--output PATH`. The
    rendered Rust source goes to stdout when no output path is given;
    otherwise it is written to that path and a one-line summary of the range
    counts is printed to stderr.
    """
    ap = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    ap.add_argument("unicode_data", type=Path, help="Path to UnicodeData.txt")
    ap.add_argument(
        "-o", "--output", type=Path, default=None,
        help="Output Rust file (default: stdout)",
    )
    args = ap.parse_args(argv)

    cats, names = parse_categories(args.unicode_data)
    other = coalesce_ranges(cats, names, ASSIGNED_OTHER_CATS, label=True)
    unassigned = coalesce_ranges(cats, names, frozenset({"Cn"}), label=False)
    rust = render_rust(other, unassigned)

    if args.output is None:
        sys.stdout.write(rust)
        return

    # The rendered source contains non-ASCII characters (em dashes in the
    # generated doc comments), so write it as UTF-8 explicitly rather than
    # relying on the locale default. Otherwise the bytes on disk would vary by
    # platform and could fail the up-to-date `diff` check.
    args.output.write_text(rust, encoding="utf-8")
    print(
        f"Wrote {args.output} "
        f"({len(other)} assigned-Other ranges, "
        f"{len(unassigned)} unassigned ranges).",
        file=sys.stderr,
    )
305+
306+
307+
if __name__ == "__main__":
308+
main(sys.argv[1:])

lightning-types/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@ pub mod features;
2727
pub mod payment;
2828
pub mod routing;
2929
pub mod string;
30+
mod unicode;

lightning-types/src/string.rs

Lines changed: 5 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
use alloc::string::String;
1313
use core::fmt;
1414

15+
use crate::unicode::*;
16+
1517
/// Struct to `Display` fields in a safe way using `PrintableString`
1618
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Default)]
1719
pub struct UntrustedString(pub String);
@@ -31,7 +33,9 @@ impl<'a> fmt::Display for PrintableString<'a> {
3133
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
3234
use core::fmt::Write;
3335
for c in self.0.chars() {
34-
let c = if c.is_control() || is_format_char(c) {
36+
let is_other = is_unicode_general_category_other(c);
37+
let is_unassigned = is_unicode_general_category_unassigned(c);
38+
let c = if c.is_control() || is_other || is_unassigned {
3539
core::char::REPLACEMENT_CHARACTER
3640
} else {
3741
c
@@ -43,39 +47,6 @@ impl<'a> fmt::Display for PrintableString<'a> {
4347
}
4448
}
4549

46-
// Codepoints in Unicode general category `Cf` (Format), per Unicode standard. These are not
47-
// matched by `char::is_control` (which only covers `Cc`), but include the bidirectional override /
48-
// isolate controls (e.g. U+202E RLO) and zero-width characters behind the "Trojan Source" attack
49-
// family (CVE-2021-42574), where an attacker-supplied string renders to a human reader as
50-
// something other than its byte content. Strip them alongside `Cc` characters when sanitising
51-
// untrusted input.
52-
fn is_format_char(c: char) -> bool {
53-
matches!(
54-
c as u32,
55-
0x00AD
56-
| 0x0600..=0x0605
57-
| 0x061C
58-
| 0x06DD
59-
| 0x070F
60-
| 0x0890..=0x0891
61-
| 0x08E2
62-
| 0x180E
63-
| 0x200B..=0x200F
64-
| 0x202A..=0x202E
65-
| 0x2060..=0x2064
66-
| 0x2066..=0x206F
67-
| 0xFEFF
68-
| 0xFFF9..=0xFFFB
69-
| 0x110BD
70-
| 0x110CD
71-
| 0x13430..=0x1343F
72-
| 0x1BCA0..=0x1BCA3
73-
| 0x1D173..=0x1D17A
74-
| 0xE0001
75-
| 0xE0020..=0xE007F
76-
)
77-
}
78-
7950
#[cfg(test)]
8051
mod tests {
8152
use super::PrintableString;

0 commit comments

Comments
 (0)