#!/usr/bin/env python3

"""Fail if any file with a `working-tree-encoding` gitattribute is not stored as UTF-8 in Git.

Git keeps working-tree-encoding content as UTF-8 in the blob and converts to the declared
encoding only on checkout. Files uploaded via the GitHub web UI / contents API bypass Git's
clean filter, so their raw legacy bytes get committed as the blob. On later checkout Git can no
longer convert UTF-8 -> the declared encoding, logs "error: failed to encode ..." and yet exits
0 - so the corruption is silent and the build stays green.

This validates the attributed blobs without a checkout: it streams them out of Git and runs the
same UTF-8 -> declared-encoding conversion through `iconv`, which is the converter Git itself
uses (a plain table codec would, for instance, wrongly reject valid cp1258 Vietnamese text that
Git accepts via combining-diacritic decomposition). Files are grouped by encoding so a whole
group converts in a single iconv pass; only a group that fails is then probed file by file.
"""
| 17 | + |
from __future__ import annotations

import subprocess
import sys
from collections import defaultdict
from collections.abc import Iterator
| 24 | + |
# working-tree-encoding values that need no UTF-8 -> legacy conversion, so they cannot fail.
# Matched against the lower-cased attribute value. "unspecified", "unset" and "set" are the
# placeholders `git check-attr` reports when the attribute carries no encoding name.
SKIP = frozenset({"", "unspecified", "unset", "set", "utf-8", "utf8"})
| 27 | + |
| 28 | + |
def git(*args: str, stdin: bytes | None = None) -> bytes:
    """Run `git <args>` and return its raw stdout.

    Args:
        *args: Arguments passed to git verbatim (argv list, no shell involved).
        stdin: Optional bytes fed to the command's standard input.

    Returns:
        Everything git wrote to stdout, undecoded.

    Raises:
        subprocess.CalledProcessError: If git exits non-zero. Its captured stderr is
            replayed on our stderr first, because the exception message alone only names
            the command and its exit status.
    """
    try:
        completed = subprocess.run(["git", *args], input=stdin, capture_output=True, check=True)
    except subprocess.CalledProcessError as err:
        # capture_output swallowed git's diagnostics; surface them before propagating.
        if err.stderr:
            sys.stderr.write(err.stderr.decode("utf-8", "replace"))
        raise
    return completed.stdout
| 31 | + |
| 32 | + |
def cat_file_batch(oids: list[str]) -> bytes:
    """Raw `git cat-file --batch` output (per blob: a header line, the bytes, a newline).

    Args:
        oids: Object ids to request; each is written to git's stdin on its own line.

    Returns:
        The undecoded stdout of the single `git cat-file --batch` invocation.
    """
    request = "".join(f"{oid}\n" for oid in oids).encode("ascii")
    return git("cat-file", "--batch", stdin=request)
| 36 | + |
| 37 | + |
def split_blobs(batch: bytes) -> Iterator[bytes]:
    """Yield each blob's bytes from `git cat-file --batch` output, in request order.

    Each record is a header line "<oid> <type> <size>", then exactly <size> raw bytes, then
    one newline. The size taken from the header delimits the content, so blobs that contain
    newlines themselves are split correctly.

    Args:
        batch: Concatenated records as produced by `git cat-file --batch`.

    Yields:
        The content bytes of each record.

    Raises:
        ValueError: If a header does not have the three-field form with a numeric size
            (for example the "<oid> missing" line) or has no terminating newline.
    """
    pos = 0
    while pos < len(batch):
        newline = batch.find(b"\n", pos)
        header = batch[pos:] if newline == -1 else batch[pos:newline]
        fields = header.split()
        if newline == -1 or len(fields) != 3 or not fields[2].isdigit():
            # Fail loudly: silently skipping a record would misalign blobs and paths.
            raise ValueError(f"unexpected cat-file --batch header: {header[:80]!r}")
        size = int(fields[2])  # header: "<oid> blob <size>"
        start = newline + 1
        yield batch[start : start + size]
        pos = start + size + 1  # skip the blob and its trailing newline
| 47 | + |
| 48 | + |
def iconv_ok(data: bytes, encoding: str) -> bool:
    """True if `data` converts cleanly from UTF-8 to `encoding` - Git's own conversion path.

    Git accepts "UTF-16LE-BOM" and "UTF-16BE-BOM" as working-tree-encoding values, names that
    iconv does not know: Git converts to plain UTF-16LE / UTF-16BE and adds the byte-order
    mark itself. The "-BOM" suffix is therefore dropped before calling iconv, so files with
    those attributes are not reported as corrupt merely because of the encoding name.

    Args:
        data: Bytes expected to be valid UTF-8.
        encoding: Target encoding name as declared in the gitattribute.

    Returns:
        True when `iconv -f UTF-8 -t <encoding>` exits with status 0, False otherwise
        (invalid UTF-8, unrepresentable characters, or an encoding iconv rejects).
    """
    # Git matches these names case-insensitively, with or without the dash after "UTF".
    if encoding.upper() in {"UTF-16LE-BOM", "UTF16LE-BOM", "UTF-16BE-BOM", "UTF16BE-BOM"}:
        encoding = encoding[: -len("-BOM")]
    result = subprocess.run(
        ["iconv", "-f", "UTF-8", "-t", encoding],
        input=data,
        # Only the exit status matters; the converted bytes and messages are discarded.
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    return result.returncode == 0
| 58 | + |
| 59 | + |
def main() -> int:
    """Check every tracked file that declares a legacy `working-tree-encoding`.

    Lists the index, asks Git for each path's `working-tree-encoding`, and verifies that the
    stored blobs convert from UTF-8 to the declared encoding. Each encoding group is tried in
    one iconv pass; only a failing group is re-checked blob by blob to name the culprits.

    Returns:
        0 when there is nothing to check or every attributed blob converts cleanly;
        1 after printing the affected paths otherwise.
    """
    # path -> blob oid for every tracked file ("<mode> <oid> <stage>\t<path>", NUL-separated).
    oids: dict[str, str] = {}
    for entry in git("ls-files", "-s", "-z").split(b"\0"):
        if not entry:
            continue
        meta, _, path = entry.partition(b"\t")
        mode, oid = meta.split(b" ")[:2]
        if mode == b"160000":
            # A gitlink (submodule) entry records a commit, not a blob: no content to check.
            continue
        oids[path.decode("utf-8", "surrogateescape")] = oid.decode("ascii")

    # Group the attributed files by their declared encoding.
    fields: list[bytes] = []
    if oids:
        # Skip check-attr when nothing is tracked: the lone trailing NUL would otherwise be
        # sent as a single empty path name.
        paths_z = "\0".join(oids).encode("utf-8", "surrogateescape") + b"\0"
        fields = git("check-attr", "--stdin", "-z", "working-tree-encoding", stdin=paths_z).split(b"\0")
    groups: dict[str, list[tuple[str, str]]] = defaultdict(list)  # encoding -> [(path, oid)]
    # Output is "<path>\0<attribute>\0<value>\0" per file; zip drops the trailing empty field.
    for path_b, value_b in zip(fields[0::3], fields[2::3]):
        if not path_b:
            continue
        encoding = value_b.decode("ascii", "replace")
        if encoding.lower() in SKIP:
            continue
        path = path_b.decode("utf-8", "surrogateescape")
        groups[encoding].append((path, oids[path]))

    if not groups:
        print("Encoding check: no legacy working-tree-encoding files.")
        return 0

    bad: list[str] = []
    for encoding, items in groups.items():
        batch = cat_file_batch([oid for _, oid in items])
        # Fast path: the whole batch (headers included) goes through iconv in one pass.
        if iconv_ok(batch, encoding):
            continue
        # The group failed somewhere; find the offending files.
        bad.extend(path for (path, _), blob in zip(items, split_blobs(batch)) if not iconv_ok(blob, encoding))

    if bad:
        print("::error::Files with a working-tree-encoding attribute are not stored as UTF-8 in Git.")
        print("They were most likely uploaded via the GitHub web UI/API, which bypasses Git's encoding filter.")
        print("Fix: re-commit them from a local clone (git add --renormalize <files>), or upload UTF-8 bytes.")
        print("Affected files:")
        for path in sorted(bad):
            print(f"  {path}")
        return 1

    print("Encoding check passed: all working-tree-encoding files are stored as UTF-8 in Git.")
    return 0
| 106 | + |
| 107 | + |
if __name__ == "__main__":
    # Use main()'s return value as the process exit status (non-zero when bad files exist).
    sys.exit(main())
0 commit comments