|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# This file is Copyright its original authors, visible in version control |
| 3 | +# history. |
| 4 | +# |
| 5 | +# This file is licensed under the Apache License, Version 2.0 <LICENSE-APACHE |
| 6 | +# or http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 7 | +# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your option. |
| 8 | +# You may not use this file except in accordance with one or both of these |
| 9 | +# licenses. |
| 10 | + |
| 11 | +"""Generate Unicode general-category predicates from `UnicodeData.txt`. |
| 12 | +
|
| 13 | +Emits two `pub(crate)` functions taking a `char`, split into two disjoint |
| 14 | +buckets across the Unicode top-level `C` ("Other") category so callers can |
| 15 | +compose them: |
| 16 | +
|
| 17 | + is_unicode_general_category_other — Cc / Cf / Cs / Co (assigned) |
| 18 | + is_unicode_general_category_unassigned — Cn (plus codepoints above |
| 19 | + U+10FFFF, which aren't |
| 20 | + valid codepoints at all) |
| 21 | +
|
| 22 | +`UnicodeData.txt` is the canonical machine-readable listing of every assigned |
| 23 | +codepoint in the Unicode Character Database. Each line is `;`-separated; field |
| 24 | +0 is the codepoint (hex), field 1 is the name, and field 2 is the two-letter |
| 25 | +general category (e.g. `Lu`, `Cf`, `Mn`). Codepoints absent from the file have |
| 26 | +category `Cn` (Unassigned) by convention. |
| 27 | +
|
| 28 | +Two encoding details to preserve: |
| 29 | + * Large blocks of contiguous same-category codepoints are written as two |
| 30 | + consecutive entries whose names end in `, First>` and `, Last>`. Every |
| 31 | + codepoint between First and Last (inclusive) shares the listed category. |
| 32 | + * The codepoint range is U+0000..=U+10FFFF. |
| 33 | +
|
| 34 | +Each `matches!` arm in the assigned-Other table carries an end-of-line comment |
| 35 | +derived from the `UnicodeData.txt` name field — typically the longest common |
| 36 | +word prefix or suffix across the names in the range, falling back to the set |
| 37 | +of categories when the names share nothing meaningful. The unassigned table |
| 38 | +omits per-arm comments since every range there has the same meaning by |
| 39 | +construction. |
| 40 | +
|
| 41 | +Usage: |
| 42 | + contrib/gen_unicode_general_category.py UnicodeData.txt > out.rs |
| 43 | +""" |
| 44 | + |
| 45 | +import argparse |
| 46 | +import sys |
| 47 | +from pathlib import Path |
| 48 | + |
# Highest codepoint the generator scans; `coalesce_ranges` walks
# U+0000 through this value inclusive.
MAX_CODEPOINT = 0x10FFFF

# Rust `//` licence banner placed first in the rendered output.
LICENSE_HEADER = """\
// This file is Copyright its original authors, visible in version control
// history.
//
// This file is licensed under the Apache License, Version 2.0 <LICENSE-APACHE
// or http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your option.
// You may not use this file except in accordance with one or both of these
// licenses.
"""

# Rust `//` notice placed right after the licence banner, telling readers the
# file is machine-generated and how to regenerate it.
GENERATED_NOTICE = """\
// Auto-generated from the Unicode Character Database (UnicodeData.txt) by
// contrib/gen_unicode_general_category.py. Do not edit by hand; rerun the
// generator with an updated UnicodeData.txt to refresh the table.
"""
| 67 | + |
| 68 | + |
def _normalize_name(name):
    """Return `name` without its `<...>` wrapper and range markers.

    Names that are not wrapped in angle brackets are returned unchanged.
    For wrapped names the brackets are removed, then a trailing `, First`
    and a trailing `, Last` are each removed (checked in that order), so
    `<Non Private Use High Surrogate, First>` becomes
    `Non Private Use High Surrogate` and `<control>` becomes `control`.
    """
    is_bracketed = name.startswith("<") and name.endswith(">")
    if not is_bracketed:
        return name

    stripped = name[1:-1]
    for marker in (", First", ", Last"):
        if stripped.endswith(marker):
            stripped = stripped[: len(stripped) - len(marker)]
    return stripped
| 81 | + |
| 82 | + |
def parse_categories(path):
    """Parse a `UnicodeData.txt`-style file into category and name tables.

    `path` must provide `open()` (e.g. a `pathlib.Path`). Each non-empty line
    is split on `;`; field 0 is the hex codepoint, field 1 the name and
    field 2 the general category.

    Returns `(cats, names)`: two dicts keyed by integer codepoint, mapping to
    the category string and to the normalised name respectively. A
    `, First>` / `, Last>` pair is expanded so that every codepoint in the
    inclusive range receives the pair's category and normalised name.
    Codepoints that never appear are simply absent from both dicts; callers
    treat those as `Cn` (Unassigned).

    Raises `ValueError`, prefixed with `path` (and the line number where one
    applies), when a line has fewer than three fields, the codepoint is not
    valid hex, a `, First>` entry is not immediately closed by a `, Last>`
    entry of the same category, a `, Last>` entry appears without an open
    range, a range ends before it starts, or the file ends with a range
    still open.
    """
    cats = {}
    names = {}
    pending_first = None  # (first_cp, first_cat, normalised_name) once a range opens.
    # The UCD data files are ASCII; decode explicitly rather than relying on
    # whatever the platform's locale encoding happens to be.
    with path.open(encoding="utf-8") as f:
        for lineno, raw in enumerate(f, 1):
            line = raw.rstrip("\n")
            if not line:
                continue
            fields = line.split(";")
            if len(fields) < 3:
                raise ValueError(f"{path}:{lineno}: expected at least 3 fields, got {len(fields)}")
            try:
                cp = int(fields[0], 16)
            except ValueError as err:
                # Re-raise with file/line context like every other error here.
                raise ValueError(
                    f"{path}:{lineno}: invalid hex codepoint {fields[0]!r}"
                ) from err
            name = fields[1]
            cat = fields[2]
            if pending_first is not None:
                first_cp, first_cat, first_name = pending_first
                if not name.endswith(", Last>"):
                    raise ValueError(
                        f"{path}:{lineno}: expected `, Last>` to close range "
                        f"opened at U+{first_cp:04X}, got name {name!r}"
                    )
                if cat != first_cat:
                    raise ValueError(
                        f"{path}:{lineno}: range U+{first_cp:04X}..=U+{cp:04X} "
                        f"has mismatched categories {first_cat!r} / {cat!r}"
                    )
                if cp < first_cp:
                    # Without this check the range below would be empty and
                    # the whole block would silently vanish from the tables.
                    raise ValueError(
                        f"{path}:{lineno}: range U+{first_cp:04X}..=U+{cp:04X} "
                        f"ends before it starts"
                    )
                for x in range(first_cp, cp + 1):
                    cats[x] = cat
                    names[x] = first_name
                pending_first = None
            elif name.endswith(", First>"):
                pending_first = (cp, cat, _normalize_name(name))
            elif name.endswith(", Last>"):
                # An orphan `, Last>` would otherwise be recorded as a single
                # codepoint, silently dropping the rest of its range.
                raise ValueError(
                    f"{path}:{lineno}: `, Last>` entry at U+{cp:04X} has no "
                    f"preceding `, First>` entry"
                )
            else:
                cats[cp] = cat
                names[cp] = _normalize_name(name)
    if pending_first is not None:
        raise ValueError(f"{path}: dangling `, First>` entry at U+{pending_first[0]:04X}")
    return cats, names
| 126 | + |
| 127 | + |
# General categories that make up the "assigned Other" bucket; `main` passes
# this set to `coalesce_ranges` to build the first predicate's table.
ASSIGNED_OTHER_CATS = frozenset({"Cc", "Cf", "Cs", "Co"})
| 129 | + |
| 130 | + |
def coalesce_ranges(cats, names, target_cats, *, label):
    """Collapse matching codepoints into maximal inclusive ranges.

    Scans U+0000..=`MAX_CODEPOINT` and returns a list of
    `(start, end, label)` tuples, one per contiguous run of codepoints whose
    category (looked up in `cats`, defaulting to `Cn` when absent) is a
    member of `target_cats`.

    When `label` is truthy, each tuple's third element is built by
    `_make_label` from the names and categories found inside the range;
    otherwise it is the empty string.
    """
    spans = []
    run_start = None
    for codepoint in range(MAX_CODEPOINT + 1):
        if cats.get(codepoint, "Cn") in target_cats:
            # Inside a run: remember where it began, if we have not already.
            if run_start is None:
                run_start = codepoint
            continue
        if run_start is not None:
            # First non-matching codepoint after a run closes it.
            spans.append((run_start, codepoint - 1))
            run_start = None
    if run_start is not None:
        # A run that is still open extends to the very last codepoint.
        spans.append((run_start, MAX_CODEPOINT))

    if not label:
        return [(first, last, "") for first, last in spans]

    result = []
    for first, last in spans:
        members = range(first, last + 1)
        seen_cats = {cats.get(cp, "Cn") for cp in members}
        looked_up = (names.get(cp) for cp in members)
        seen_names = [n for n in looked_up if n is not None]
        result.append((first, last, _make_label(seen_names, seen_cats)))
    return result
| 165 | + |
| 166 | + |
def _common_word_run(names, *, from_end):
    """Return the words that every name shares at one end, space-joined.

    Each name is split on whitespace. With `from_end=False` the shared
    leading words are returned; with `from_end=True` the shared trailing
    words are returned, in their original order. The result is `""` when
    `names` is empty or nothing is shared.
    """
    if not names:
        return ""

    # Reversing each token list turns "common suffix" into "common prefix".
    step = -1 if from_end else 1
    token_lists = [name.split()[::step] for name in names]

    shared = []
    # `zip` stops at the shortest token list, which bounds the comparison.
    for column in zip(*token_lists):
        head = column[0]
        if any(tok != head for tok in column):
            break
        shared.append(head)

    return " ".join(shared[::step])
| 188 | + |
| 189 | + |
def _make_label(names, cats_in_range):
    """Produce a short comment label describing one coalesced range.

    Selection order:
      1. If every name is the same, use that name (e.g. `control`).
      2. Otherwise take the shared leading words and the shared trailing
         words and use whichever string has more characters; a tie goes to
         the leading words.
      3. If neither exists, list the categories in `cats_in_range`, sorted
         and joined with ` / ` (e.g. `Co / Cs`).
    """
    distinct = set(names)
    if len(distinct) == 1:
        return next(iter(distinct))

    leading = _common_word_run(names, from_end=False)
    trailing = _common_word_run(names, from_end=True)
    # `max` returns its first argument on a tie, so equal-length candidates
    # resolve to the leading words.
    best = max(leading, trailing, key=len)
    return best or " / ".join(sorted(cats_in_range))
| 212 | + |
| 213 | + |
def fmt_codepoint(cp):
    """Format `cp` as an upper-case `0x` hex literal, at least 4 digits wide.

    This mirrors `UnicodeData.txt`, which writes BMP codepoints with four
    hex digits and higher planes with as many as they need.
    """
    # The width in `04X` is a minimum: values above 0xFFFF are printed in
    # full, so one format spec covers both the BMP and the higher planes.
    return f"0x{cp:04X}"
| 218 | + |
| 219 | + |
def _pattern(start, end):
    """Return the Rust pattern for an inclusive codepoint range.

    A single-codepoint range renders as one hex literal; anything else
    renders as `low..=high`.
    """
    low = fmt_codepoint(start)
    return low if start == end else f"{low}..={fmt_codepoint(end)}"
| 224 | + |
| 225 | + |
def _emit_matches_body(lines, arms):
    """Append a tab-indented `matches!(c as u32, ...)` expression to `lines`.

    `arms` is an iterable of `(pattern, label)` pairs. The first pattern is
    written at the macro-argument indent; each later one goes one level
    deeper behind a leading `| `. A non-empty label is added as a trailing
    `// label` comment. `lines` is mutated in place; nothing is returned.
    """
    emit = lines.append
    emit("\tmatches!(")
    emit("\t\tc as u32,")

    lead = "\t\t"
    for pattern, label in arms:
        trailer = f" // {label}" if label else ""
        emit(f"{lead}{pattern}{trailer}")
        # Every arm after the first is an alternative of the same pattern.
        lead = "\t\t\t| "

    emit("\t)")
| 239 | + |
| 240 | + |
def render_rust(other_ranges, unassigned_ranges):
    """Build the complete generated Rust source as a single string.

    Both arguments are lists of `(start, end, label)` tuples. The output is
    the licence banner, the generated-file notice, and then two documented
    `pub(crate)` functions taking a `char`, each wrapping a `matches!` over
    `c as u32`. The unassigned function gets one extra, synthetic final arm
    covering `u32` values above U+10FFFF, which are not codepoints at all.
    """
    out = [LICENSE_HEADER, GENERATED_NOTICE]

    # --- is_unicode_general_category_other -------------------------------
    out += [
        "/// Returns `true` if `c` is in Unicode general category `Cc` (Control), `Cf`",
        "/// (Format), `Cs` (Surrogate), or `Co` (Private Use) — the assigned codepoints",
        '/// in the top-level `C` ("Other") category. The `Cs` portion of the table is',
        "/// unreachable for `char` input (a `char` cannot hold a surrogate) but is kept",
        "/// so the table mirrors the source UCD data verbatim. The disjoint `Cn`",
        "/// (Unassigned) bucket is `is_unicode_general_category_unassigned`.",
        "#[allow(dead_code)]",
        "pub(crate) fn is_unicode_general_category_other(c: char) -> bool {",
    ]
    _emit_matches_body(
        out, [(_pattern(lo, hi), note) for lo, hi, note in other_ranges]
    )
    out += ["}", ""]

    # --- is_unicode_general_category_unassigned --------------------------
    out += [
        "/// Returns `true` if `c` is in Unicode general category `Cn` (Unassigned), or",
        "/// strictly above U+10FFFF. The trailing `0x110000..=u32::MAX` arm is",
        "/// unreachable for `char` input (a `char` is bounded to U+10FFFF) but is kept",
        "/// for defensive coverage of the underlying `u32`. The disjoint Cc / Cf / Cs /",
        "/// Co bucket is `is_unicode_general_category_other`.",
        "#[allow(dead_code)]",
        "pub(crate) fn is_unicode_general_category_unassigned(c: char) -> bool {",
    ]
    tail_arms = [(_pattern(lo, hi), note) for lo, hi, note in unassigned_ranges]
    tail_arms.append(("0x110000..=u32::MAX", "above U+10FFFF — unreachable for `char`"))
    _emit_matches_body(out, tail_arms)
    # The trailing empty element makes the joined text end with a newline.
    out += ["}", ""]

    return "\n".join(out)
| 279 | + |
| 280 | + |
def main(argv):
    """Command-line entry point: parse the data file and emit the Rust table.

    `argv` is the argument list without the program name. It must contain
    the path to `UnicodeData.txt` and may contain `-o/--output PATH`. With
    no output path the Rust source is written to stdout; otherwise it is
    written to the file and a one-line summary is printed to stderr.

    The generated source contains non-ASCII characters and Rust requires
    source files to be UTF-8, so output is always encoded as UTF-8 rather
    than in the platform's locale encoding.
    """
    # `__doc__` is None when Python runs with `-OO`; fall back to no
    # description instead of crashing before argument parsing.
    description = __doc__.splitlines()[0] if __doc__ else None
    ap = argparse.ArgumentParser(description=description)
    ap.add_argument("unicode_data", type=Path, help="Path to UnicodeData.txt")
    ap.add_argument(
        "-o", "--output", type=Path, default=None,
        help="Output Rust file (default: stdout)",
    )
    args = ap.parse_args(argv)

    cats, names = parse_categories(args.unicode_data)
    other = coalesce_ranges(cats, names, ASSIGNED_OTHER_CATS, label=True)
    unassigned = coalesce_ranges(cats, names, frozenset({"Cn"}), label=False)
    rust = render_rust(other, unassigned)

    if args.output is None:
        # A redirected stdout (`> out.rs`) otherwise uses the locale encoding,
        # which can yield a file rustc rejects. Replacement stdout objects
        # may not offer `reconfigure`, hence the lookup.
        reconfigure = getattr(sys.stdout, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding="utf-8")
        sys.stdout.write(rust)
    else:
        args.output.write_text(rust, encoding="utf-8")
        print(
            f"Wrote {args.output} "
            f"({len(other)} assigned-Other ranges, "
            f"{len(unassigned)} unassigned ranges).",
            file=sys.stderr,
        )
| 305 | + |
| 306 | + |
# Run the generator only when executed as a script, passing the command-line
# arguments without the program name.
if __name__ == "__main__":
    main(sys.argv[1:])
0 commit comments