-
Notifications
You must be signed in to change notification settings - Fork 0
83 lines (71 loc) · 3.16 KB
/
ci.yml
File metadata and controls
83 lines (71 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
name: CI

# Run on pull requests that target main and on pushes to main.
on:
  pull_request:
    branches: [main]
  push:
    branches: [main]

# Least privilege: every step below only reads the checked-out repository.
permissions:
  contents: read

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          # Quoted so the version is a string, not a float.
          python-version: "3.11"
          cache: pip

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Validate schema (taxonomy + snapshot referential integrity)
        run: python scripts/validate_schema.py

      # Rewrites the artifacts that the two inline checks below inspect.
      - name: Build matrices (smoke test — script must run cleanly)
        run: python scripts/build_matrices.py

      - name: Compute validation (smoke test — script must run cleanly)
        run: python scripts/compute_validation.py

      # Inline Python: asserts dtypes of the wide parquet matrices and of the
      # long panel, and that the panel has no missing sector_code values.
      - name: Dtype + index + NaN assertion
        run: |
          python - <<'EOF'
          import pandas as pd

          CODE_COLS = ["class_code", "sector_code", "sub_sector_code"]

          # wide matrices: datetime index, every column nullable Int64
          for col in CODE_COLS:
              w = pd.read_parquet(f"classification/wide/{col}.parquet")
              assert str(w.index.dtype) == "datetime64[ns]", f"{col} wide index dtype: {w.index.dtype}"
              dtypes = w.dtypes.unique()
              assert len(dtypes) == 1 and str(dtypes[0]) == "Int64", f"{col} wide column dtypes: {dtypes}"

          # panel column dtypes
          panel = pd.read_parquet("classification/long/panel.parquet")
          for col in CODE_COLS:
              assert str(panel[col].dtype) == "Int64", f"panel {col} dtype: {panel[col].dtype}"

          # panel must have no NaN code rows (panel only stores existence rows)
          nan_rows = panel[panel["sector_code"].isna()]
          assert len(nan_rows) == 0, f"panel has {len(nan_rows)} NaN sector_code rows — PIT lookup broken"
          print("[ci] dtype + index + NaN assertions passed")
          EOF

      # Inline Python: fails the job when any rebuilt CSV differs from the
      # version committed at HEAD.
      - name: Content-equality check on rebuilt CSV matrices
        run: |
          python - <<'EOF'
          import subprocess
          import sys

          # Compare each CSV matrix's content (not bytes) against the committed version.
          # Parquet is intentionally excluded — pyarrow output is not byte-stable across
          # environments even with identical input, so we trust CSV as the canonical
          # text form and require parquet only to deserialize correctly.
          csv_paths = [
              "classification/wide/class_code.csv",
              "classification/wide/sector_code.csv",
              "classification/wide/sub_sector_code.csv",
              "classification/wide/chain_ecosystem.csv",
              "classification/long/panel.csv",
          ]


          def canonical(text):
              """Normalize CRLF line endings and trim surrounding whitespace."""
              return text.replace("\r\n", "\n").strip()


          mismatches = []
          for csv in csv_paths:
              committed = subprocess.check_output(["git", "show", f"HEAD:{csv}"]).decode("utf-8")
              # newline="" disables newline translation so both sides reach
              # canonical() with their raw line endings.
              with open(csv, encoding="utf-8", newline="") as f:
                  rebuilt = f.read()
              if canonical(committed) != canonical(rebuilt):
                  mismatches.append(csv)

          if mismatches:
              print("Out of sync (rebuild + commit needed):")
              for m in mismatches:
                  print(f" - {m}")
              sys.exit(1)

          # Count is derived from the list so the message cannot drift from it.
          print(f"All {len(csv_paths)} CSV artifacts in sync with snapshot.csv.")
          EOF