@@ -17,11 +17,43 @@ jobs:
1717 cache : pip
1818 - name : Install dependencies
1919 run : pip install -r requirements.txt
20- - name : Validate schema
20+
21+ - name : Validate schema (taxonomy + snapshot referential integrity)
2122 run : python scripts/validate_schema.py
22- - name : Rebuild matrices from snapshot.csv
23+
24+ - name : Build matrices (smoke test — script must run cleanly)
2325 run : python scripts/build_matrices.py
24- - name : Diff regenerated matrices against committed
26+
27+ - name : Compute validation (smoke test — script must run cleanly)
28+ run : python scripts/compute_validation.py
29+
30+ - name : Content-equality check on rebuilt CSV matrices
2531 run : |
26- git diff --exit-code classification/wide/ classification/long/ \
27- || (echo "Matrices out of sync with snapshot.csv — run scripts/build_matrices.py and commit" && exit 1)
32+ python - <<'EOF'
import subprocess
import sys
from typing import Optional

# Compare each CSV matrix's content (not bytes) against the committed version.
# Parquet is intentionally excluded — pyarrow output is not byte-stable across
# environments even with identical input, so we trust CSV as the canonical
# text form and require parquet only to deserialize correctly.
CSV_ARTIFACTS = [
    "classification/wide/class_code.csv",
    "classification/wide/sector_code.csv",
    "classification/wide/sub_sector_code.csv",
    "classification/wide/chain_ecosystem.csv",
    "classification/long/panel.csv",
]


def normalize(text: str) -> str:
    """Return *text* with line endings unified to LF and outer whitespace stripped.

    Both sides of the comparison go through this, so a CRLF/LF difference or a
    trailing newline never counts as a content change.
    """
    return text.replace("\r\n", "\n").replace("\r", "\n").strip()


def read_committed(path: str) -> Optional[str]:
    """Return the HEAD version of *path* as text, or None if git cannot show it."""
    try:
        raw = subprocess.check_output(["git", "show", f"HEAD:{path}"])
    except subprocess.CalledProcessError:
        # Typically: the artifact has never been committed. git's own error
        # message still goes to stderr, so it remains visible in the CI log.
        return None
    return raw.decode("utf-8")


def read_rebuilt(path: str) -> Optional[str]:
    """Return the working-tree version of *path* as text, or None if it is absent."""
    try:
        # newline="" disables newline translation so this side is read the same
        # way as the git side; normalize() then treats both identically.
        with open(path, encoding="utf-8", newline="") as handle:
            return handle.read()
    except FileNotFoundError:
        return None


def find_mismatches(paths, committed_reader=read_committed, rebuilt_reader=read_rebuilt):
    """Return the subset of *paths* whose rebuilt content differs from HEAD.

    A path that is missing on either side is reported as a mismatch rather than
    aborting the run, so one failure does not hide the others. The reader
    callables take a path and return its text (or None when unavailable).
    """
    out_of_sync = []
    for path in paths:
        committed = committed_reader(path)
        rebuilt = rebuilt_reader(path)
        if (
            committed is None
            or rebuilt is None
            or normalize(committed) != normalize(rebuilt)
        ):
            out_of_sync.append(path)
    return out_of_sync


def main() -> None:
    """Check every CSV artifact and exit with status 1 if any is out of sync."""
    mismatches = find_mismatches(CSV_ARTIFACTS)
    if mismatches:
        print("Out of sync (rebuild + commit needed):")
        for m in mismatches:
            print(f" - {m}")
        sys.exit(1)
    print(f"All {len(CSV_ARTIFACTS)} CSV artifacts in sync with snapshot.csv.")


# `python -` executes stdin as __main__, so the heredoc still runs the check.
if __name__ == "__main__":
    main()
59+ EOF
0 commit comments