Skip to content

Commit bceaa0c

Browse files
committed
Check working-tree-encoding issues
1 parent 54dc2e2 commit bceaa0c

3 files changed

Lines changed: 117 additions & 0 deletions

File tree

action.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ inputs:
7575
runs:
7676
using: "composite"
7777
steps:
78+
- name: Check working-tree-encoding integrity
79+
shell: bash
80+
run: $GITHUB_ACTION_PATH/action/check-encoding.py
81+
7882
- name: Install uv
7983
uses: astral-sh/setup-uv@v7
8084

action/check-encoding.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#!/usr/bin/env python3
2+
3+
"""Fail if any file with a `working-tree-encoding` gitattribute is not stored as UTF-8 in Git.
4+
5+
Git keeps working-tree-encoding content as UTF-8 in the blob and converts to the declared
6+
encoding only on checkout. Files uploaded via the GitHub web UI / contents API bypass Git's
7+
clean filter, so their raw legacy bytes get committed as the blob. On later checkout Git can no
8+
longer convert UTF-8 -> the declared encoding, logs "error: failed to encode ..." and yet exits
9+
0 - so the corruption is silent and the build stays green.
10+
11+
This validates the attributed blobs without a checkout: it streams them out of Git and runs the
12+
same UTF-8 -> declared-encoding conversion through `iconv`, which is the converter Git itself
13+
uses (a plain table codec would, for instance, wrongly reject valid cp1258 Vietnamese text that
14+
Git accepts via combining-diacritic decomposition). Files are grouped by encoding so a whole
15+
group converts in a single iconv pass; only a group that fails is then probed file by file.
16+
"""
17+
18+
from __future__ import annotations
19+
20+
import subprocess
21+
import sys
22+
from collections import defaultdict
23+
from collections.abc import Iterator
24+
25+
# working-tree-encoding values that need no UTF-8 -> legacy conversion, so they cannot fail.
26+
SKIP = {
    # No value at all.
    "",
    # Attribute states (rather than encoding names) as reported by `git check-attr`.
    "unspecified",
    "unset",
    "set",
    # Already UTF-8; compared after lower-casing the attribute value.
    "utf-8",
    "utf8",
}
27+
28+
29+
def git(*args: str, stdin: bytes | None = None) -> bytes:
    """Run `git` with the given arguments and return its raw stdout.

    Args:
        args: Arguments placed after `git` on the command line.
        stdin: Optional bytes fed to the process on standard input.

    Returns:
        Everything the command wrote to stdout, undecoded.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status. The captured
            stderr is echoed to this process's stderr first.
    """
    try:
        return subprocess.run(["git", *args], input=stdin, capture_output=True, check=True).stdout
    except subprocess.CalledProcessError as err:
        # stderr is captured, so without echoing it the log would show only the exit status
        # and give no hint of what git actually complained about.
        if err.stderr:
            sys.stderr.write(err.stderr.decode("utf-8", "replace"))
            sys.stderr.flush()
        raise
31+
32+
33+
def cat_file_batch(oids: list[str]) -> bytes:
    """Fetch all `oids` with one `git cat-file --batch` invocation.

    The object ids are written to git's stdin one per line. The return value is the unparsed
    batch stream: for each blob a header line, the blob's bytes, and a terminating newline.
    """
    request = "".join(f"{oid}\n" for oid in oids)
    return git("cat-file", "--batch", stdin=request.encode("ascii"))
36+
37+
38+
def split_blobs(batch: bytes) -> Iterator[bytes]:
    """Yield each requested object's bytes from `git cat-file --batch` output, in request order.

    Exactly one item is produced per record in `batch`, so the result can be zipped with the
    list of requested oids. A record whose header has fewer than three fields (git prints
    `<oid> missing` for an object it cannot provide, with no body following) yields `b""`
    instead of raising, which keeps later items aligned with their requests.

    Args:
        batch: Raw stdout of `git cat-file --batch`.

    Yields:
        The content bytes of each record, without its header line or trailing newline.

    Raises:
        ValueError: If a header line is not newline-terminated or its size field is not an
            integer.
    """
    pos = 0
    end = len(batch)
    while pos < end:
        newline = batch.index(b"\n", pos)
        header = batch[pos:newline].split()
        start = newline + 1
        if len(header) < 3:
            # "<oid> missing": header line only - no content and no trailing newline.
            yield b""
            pos = start
            continue
        size = int(header[2])  # header: "<oid> <type> <size>"
        yield batch[start : start + size]
        pos = start + size + 1  # skip the content and its trailing newline
47+
48+
49+
def iconv_ok(data: bytes, encoding: str) -> bool:
    """Report whether `iconv` can transcode `data` from UTF-8 into `encoding`.

    `data` is piped to `iconv -f UTF-8 -t <encoding>`. The converted output and any
    diagnostics are discarded; only the exit status decides the result.
    """
    command = ["iconv", "-f", "UTF-8", "-t", encoding]
    completed = subprocess.run(
        command,
        input=data,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    succeeded = completed.returncode == 0
    return succeeded
58+
59+
60+
def main() -> int:
    """Check every tracked file that declares a legacy `working-tree-encoding`.

    Steps:
      1. Map each tracked path to its index oid via `git ls-files -s -z`.
      2. Ask `git check-attr` for each path's `working-tree-encoding` and group the paths
         by encoding, ignoring the values in `SKIP`.
      3. For each encoding, run the whole group's `cat-file --batch` stream through iconv
         once; only when that fails, test the group's blobs one by one.

    Returns:
        0 if there is nothing to check or every attributed blob converts cleanly from UTF-8
        to its declared encoding; 1 after printing the affected paths otherwise.
    """
    # path -> blob oid for every tracked file ("<mode> <oid> <stage>\t<path>", NUL-separated).
    oids: dict[str, str] = {}
    for entry in git("ls-files", "-s", "-z").split(b"\0"):
        if not entry:
            continue
        meta, _, path = entry.partition(b"\t")
        mode, oid = meta.split(b" ")[:2]
        if mode == b"160000":
            # Gitlink (submodule): the oid names a commit, not a blob, so there is no file
            # content to validate.
            continue
        oids[path.decode("utf-8", "surrogateescape")] = oid.decode("ascii")

    # check-attr -z output is a flat NUL-separated sequence of <path>, <attribute>, <value>.
    fields: list[bytes] = []
    if oids:
        # With no tracked files the joined list would be a lone NUL, i.e. one empty path;
        # skip the call instead of sending that.
        paths_z = "\0".join(oids).encode("utf-8", "surrogateescape") + b"\0"
        fields = git("check-attr", "--stdin", "-z", "working-tree-encoding", stdin=paths_z).split(b"\0")

    # Group the attributed files by their declared encoding.
    groups: dict[str, list[tuple[str, str]]] = defaultdict(list)  # encoding -> [(path, oid)]
    for path_b, value_b in zip(fields[0::3], fields[2::3]):
        if not path_b:
            continue
        encoding = value_b.decode("ascii", "replace")
        if encoding.lower() in SKIP:
            continue
        path = path_b.decode("utf-8", "surrogateescape")
        groups[encoding].append((path, oids[path]))

    if not groups:
        print("Encoding check: no legacy working-tree-encoding files.")
        return 0

    bad: list[str] = []
    for encoding, items in groups.items():
        batch = cat_file_batch([oid for _, oid in items])
        # Fast path: the header lines are plain text, so the whole stream can be converted
        # in one iconv run.
        if iconv_ok(batch, encoding):
            continue
        # The group failed somewhere; find the offending files.
        bad.extend(path for (path, _), blob in zip(items, split_blobs(batch)) if not iconv_ok(blob, encoding))

    if bad:
        print("::error::Files with a working-tree-encoding attribute are not stored as UTF-8 in Git.")
        print("They were most likely uploaded via the GitHub web UI/API, which bypasses Git's encoding filter.")
        print("Fix: re-commit them from a local clone (git add --renormalize <files>), or upload UTF-8 bytes.")
        print("Affected files:")
        for path in sorted(bad):
            # Paths were decoded with surrogateescape; re-decode leniently so a filename that
            # is not valid UTF-8 cannot make print() raise UnicodeEncodeError.
            shown = path.encode("utf-8", "surrogateescape").decode("utf-8", "replace")
            print(f" {shown}")
        return 1

    print("Encoding check passed: all working-tree-encoding files are stored as UTF-8 in Git.")
    return 0
106+
107+
108+
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)

docs/changelog.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
## Changelog
22

3+
### WIP
4+
5+
- Action: now fails the run if any file with a `working-tree-encoding` gitattribute is not stored as UTF-8 in Git. Files uploaded through the GitHub web UI or contents API bypass Git's encoding filter and commit raw legacy bytes; on checkout Git merely logs a `failed to encode` error for them and still exits successfully, so the build stays green. The check surfaces the affected files so they can be re-committed correctly.
6+
37
### 1.5.3
48

59
- `msgmerge-female` now strips `\r` from CRLF line endings in PO and POT files before invoking gettext `msgmerge`. Working trees produced by Git's `autocrlf=true` on Windows would otherwise produce mixed-ending output and spurious fuzzy/duplicate entries.

0 commit comments

Comments
 (0)