|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Temporary script: verify CSV row counts and optionally values match Parquet. |
| 3 | +
|
| 4 | +Compares the CSV tree (after unzip) to the parquet tree produced by |
| 5 | +convert_pums_csv_to_parquet. Reports per-partition row count mismatches and |
| 6 | +optionally does a value-level check (--values). |
| 7 | +
|
| 8 | +Usage: |
| 9 | + uv run python data/census/pums/check_csv_parquet_match.py --csv-dir csv --parquet-dir parquet |
| 10 | + uv run python data/census/pums/check_csv_parquet_match.py --csv-dir csv --parquet-dir parquet --values |
| 11 | +""" |
| 12 | + |
| 13 | +from __future__ import annotations |
| 14 | + |
| 15 | +import argparse |
| 16 | +import sys |
| 17 | +from pathlib import Path |
| 18 | +from typing import cast |
| 19 | + |
| 20 | +import polars as pl |
| 21 | + |
| 22 | +# Partition pattern: survey/year/record_type/state=XX (same as convert script) |
| 23 | + |
| 24 | + |
| 25 | +def _parse_pums_partition_path(part_dir: Path) -> tuple[str, int, str, str] | None: |
| 26 | + """Parse survey/year/record_type/state=XX from a partition directory path.""" |
| 27 | + parts = part_dir.resolve().parts |
| 28 | + if len(parts) < 4: |
| 29 | + return None |
| 30 | + state_part = parts[-1] |
| 31 | + if not state_part.startswith("state="): |
| 32 | + return None |
| 33 | + state = state_part[6:] |
| 34 | + record_type = parts[-2] |
| 35 | + if record_type not in ("person", "housing"): |
| 36 | + return None |
| 37 | + try: |
| 38 | + end_year = int(parts[-3]) |
| 39 | + except ValueError: |
| 40 | + return None |
| 41 | + survey = parts[-4] |
| 42 | + if survey not in ("acs1", "acs5"): |
| 43 | + return None |
| 44 | + return (survey, end_year, record_type, state) |
| 45 | + |
| 46 | + |
def _row_count_csv(part_dir: Path) -> int:
    """Sum rows across all *.csv files in a partition dir (mirrors the convert step)."""
    if not any(part_dir.glob("*.csv")):
        # No CSVs at all: zero rows rather than a failed scan.
        return 0
    counted = pl.scan_csv(str(part_dir / "*.csv")).select(pl.len()).collect()
    return int(cast(pl.DataFrame, counted).row(0)[0])
| 55 | + |
| 56 | + |
def _row_count_parquet(parquet_file: Path) -> int:
    """Return the row count of a single data.parquet, or -1 when the file is absent."""
    if not parquet_file.exists():
        return -1
    counted = pl.scan_parquet(parquet_file).select(pl.len()).collect()
    return int(cast(pl.DataFrame, counted).row(0)[0])
| 63 | + |
| 64 | + |
def check_row_counts(csv_dir: Path, parquet_dir: Path) -> tuple[bool, int, int]:
    """Compare row counts per partition. Return (all_ok, total_csv_rows, total_parquet_rows).

    Walks the CSV tree for partition directories (survey/year/record_type/state=XX),
    locates the corresponding data.parquet under parquet_dir, and prints one line
    per missing parquet file or per-partition count mismatch. A missing parquet
    partition contributes nothing to total_parquet_rows.
    """
    csv_dir = csv_dir.resolve()
    parquet_dir = parquet_dir.resolve()
    total_csv = 0
    total_pq = 0
    all_ok = True
    for part_dir in csv_dir.rglob("*"):
        if not part_dir.is_dir():
            continue
        parsed = _parse_pums_partition_path(part_dir)
        if parsed is None:
            continue
        survey, end_year, record_type, state = parsed
        csv_count = _row_count_csv(part_dir)
        out_part = parquet_dir / survey / str(end_year) / record_type / f"state={state}"
        # _row_count_parquet already returns -1 for a missing file, so the
        # original's extra pq_file.exists() guard was redundant.
        pq_count = _row_count_parquet(out_part / "data.parquet")
        total_csv += csv_count
        if pq_count < 0:
            all_ok = False
            print(
                f"MISSING parquet: {out_part.relative_to(parquet_dir)} (CSV rows={csv_count})"
            )
        else:
            total_pq += pq_count
            if pq_count != csv_count:
                all_ok = False
                print(
                    f"COUNT MISMATCH {part_dir.relative_to(csv_dir)}: CSV={csv_count} Parquet={pq_count}"
                )
    return all_ok, total_csv, total_pq
| 97 | + |
| 98 | + |
def check_partition_values(part_dir: Path, pq_file: Path) -> bool:
    """Read CSV and Parquet for one partition; compare after normalizing.

    Normalization: lowercase column names, sort rows by all columns, and cast
    every column to string so CSV-inferred dtypes still compare equal to the
    dtypes stored in the parquet file. Returns True when the data match.
    """
    lf_csv = pl.scan_csv(str(part_dir / "*.csv"))
    # Resolve the lazy schema once; the original called collect_schema() twice.
    original_names = lf_csv.collect_schema().names()
    cols = [c.lower() for c in original_names]
    lf_csv = lf_csv.rename(dict(zip(original_names, cols)))
    df_csv = cast(pl.DataFrame, lf_csv.collect()).sort(cols)
    df_pq = pl.read_parquet(pq_file)
    # Cheap structural checks before the expensive value comparison.
    if set(df_csv.columns) != set(df_pq.columns) or df_csv.height != df_pq.height:
        return False
    df_pq = df_pq.select(df_csv.columns).sort(cols)
    # Cast to string so CSV-inferred types vs parquet data-dict types still match
    a = df_csv.select(pl.all().cast(pl.Utf8))
    b = df_pq.select(pl.all().cast(pl.Utf8))
    return a.equals(b)
| 115 | + |
| 116 | + |
def check_values(csv_dir: Path, parquet_dir: Path) -> bool:
    """Run the value-level check for every partition; True when all match."""
    csv_root = csv_dir.resolve()
    pq_root = parquet_dir.resolve()
    ok = True
    for candidate in csv_root.rglob("*"):
        if not candidate.is_dir():
            continue
        parsed = _parse_pums_partition_path(candidate)
        if parsed is None:
            continue
        survey, end_year, record_type, state = parsed
        pq_file = (
            pq_root / survey / str(end_year) / record_type / f"state={state}" / "data.parquet"
        )
        if not pq_file.exists():
            # Missing partitions are reported by the row-count pass, not here.
            continue
        if not check_partition_values(candidate, pq_file):
            ok = False
            print(f"VALUE MISMATCH: {candidate.relative_to(csv_root)}")
    return ok
| 137 | + |
| 138 | + |
def main() -> int:
    """CLI entry point. Returns a process exit code (0 = all checks pass)."""
    parser = argparse.ArgumentParser(
        description="Verify CSV and Parquet row counts (and optionally values) match."
    )
    parser.add_argument(
        "--csv-dir", type=Path, required=True, help="Root of CSV tree (e.g. csv/)"
    )
    parser.add_argument(
        "--parquet-dir",
        type=Path,
        required=True,
        help="Root of Parquet tree (e.g. parquet/)",
    )
    parser.add_argument(
        "--values",
        action="store_true",
        help="Also compare values partition-by-partition (slower).",
    )
    args = parser.parse_args()

    csv_dir = args.csv_dir.resolve()
    parquet_dir = args.parquet_dir.resolve()
    if not csv_dir.is_dir():
        print(f"Error: CSV dir not found: {csv_dir}", file=sys.stderr)
        return 1
    if not parquet_dir.is_dir():
        print(f"Error: Parquet dir not found: {parquet_dir}", file=sys.stderr)
        return 1

    # Row-count pass always runs; it is cheap relative to the value check.
    ok, total_csv, total_pq = check_row_counts(csv_dir, parquet_dir)
    print(f"Total CSV rows: {total_csv}")
    print(f"Total Parquet rows: {total_pq}")
    if not ok:
        print("Row count check: FAILED")
        return 1
    print("Row count check: OK")

    # Optional, slower value-level pass.
    if args.values:
        if check_values(csv_dir, parquet_dir):
            print("Value check: OK")
        else:
            print("Value check: FAILED")
            return 1

    return 0
| 183 | + |
| 184 | + |
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    raise SystemExit(main())