Add test suite for backwards compatibility (#999)

jtilly · web-flow · commit 6457a496e657 · 2026-05-05T11:53:07.000-04:00
* Add backwards compatibility tests.

* Save data to disk, save predictions as npy.
diff --git a/.github/workflows/backwards-compatibility.yml b/.github/workflows/backwards-compatibility.yml
@@ -0,0 +1,35 @@
+name: Backwards Compatibility
+
+on:
+  # We would like to trigger CI for any pull request action -
+  # both from QuantCo's branches as well as forks.
+  pull_request:
+  # In addition to pull requests, we want to run CI for pushes
+  # to the main branch and tags.
+  push:
+    branches:
+      - "main"
+    tags:
+      - "*"
+  workflow_dispatch:
+
+jobs:
+  backwards-compatibility:
+    name: Backwards compatibility tests
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    steps:
+      - name: Checkout branch
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Set up pixi
+        uses: prefix-dev/setup-pixi@a0af7a228712d6121d37aba47adf55c1332c9c2e # v0.9.4
+        with:
+          environments: default
+          cache: true
+
+      - name: Install current glum
+        run: pixi run postinstall
+
+      - name: Run backwards compatibility tests
+        run: pixi run test-backwards-compatibility
diff --git a/pixi.toml b/pixi.toml
@@ -10,6 +10,7 @@ postinstall = "pip install --no-build-isolation --no-deps --disable-pip-version-
 store-benchmark-golden-master = { cmd = "python tests/glm/test_benchmark_golden_master.py", env = { PYTHONPATH = "." } }
 store-golden-master = { cmd = "python tests/glm/test_golden_master.py", env = { PYTHONPATH = "." } }
 test = { cmd = "pytest tests/glm --doctest-modules src/glum", env = { PYTHONPATH = "." } }
+test-backwards-compatibility = { cmd = "python tests/backwards_compatibility/run.py" }
 
 [feature.docs.tasks]
 make-docs = "cd docs && make html"
diff --git a/tests/backwards_compatibility/.gitignore b/tests/backwards_compatibility/.gitignore
@@ -0,0 +1 @@
+artifacts/
diff --git a/tests/backwards_compatibility/fit.py b/tests/backwards_compatibility/fit.py
@@ -0,0 +1,72 @@
+"""Fit a GLM with the installed glum version and save artifacts.
+
+Usage:
+  python fit.py X.Y.Z   # label artifacts with a release version
+  python fit.py HEAD     # label artifacts with HEAD (current repo version)
+
+Artifacts are written to:
+  tests/backwards_compatibility/artifacts/<version>/model.pkl
+  tests/backwards_compatibility/artifacts/<version>/predictions.npy
+
+NOTE: This script must work with glum >= 2.0.0. It deliberately avoids
+features added in 3.x: formula interface, Polars DataFrames, monotonic
+constraints, closed-form solver.
+"""
+
+import argparse
+import pickle
+from pathlib import Path
+
+import numpy as np
+
+from glum import GeneralizedLinearRegressor
+
+SCRIPT_DIR = Path(__file__).resolve().parent  # directory containing this script
+ARTIFACTS_DIR = SCRIPT_DIR / "artifacts"  # holds X.npy/y.npy and per-label outputs
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "version",
+        help="Artifact label: a release version string (e.g. 2.0.3) or HEAD",
+    )
+    args = parser.parse_args()
+
+    import glum
+
+    installed_version = glum.__version__
+
+    print(f"Installed glum version: {installed_version}")
+    print(f"Artifact label: {args.version}")
+
+    output_dir = ARTIFACTS_DIR / args.version
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    X = np.load(str(ARTIFACTS_DIR / "X.npy"))
+    y = np.load(str(ARTIFACTS_DIR / "y.npy"))
+
+    # All keyword args: glum 3.0 made all params keyword-only; kwargs work in 2.x too.
+    # alpha=1.0: explicit to avoid 2.x (default=1) vs 3.x (default=0) difference.
+    # solver="irls-cd": avoids the closed-form solver added in 3.2 which may produce
+    #   slightly different floating-point results against the iterative solver.
+    model = GeneralizedLinearRegressor(
+        family="normal",
+        alpha=1.0,
+        solver="irls-cd",
+    )
+    model.fit(X, y)
+
+    pickle_path = output_dir / "model.pkl"
+    with open(pickle_path, "wb") as f:
+        pickle.dump(model, f)
+    print(f"Saved model to {pickle_path}")
+
+    predictions = model.predict(X)
+    predictions_path = output_dir / "predictions.npy"
+    np.save(str(predictions_path), predictions)
+    print(f"Saved predictions to {predictions_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/backwards_compatibility/run.py b/tests/backwards_compatibility/run.py
@@ -0,0 +1,194 @@
+"""Backwards compatibility tests for glum.
+
+Usage: python tests/backwards_compatibility/run.py
+       (or via: pixi run test-backwards-compatibility)
+
+1. Fits the current (HEAD) glum to produce reference predictions.
+2. Queries conda-forge via `pixi search` to discover the latest patch release
+   for each minor version of glum.
+3. For each version, uses `pixi exec` to fit a model and save artifacts
+   (model.pkl + predictions.npy) under artifacts/<version>/.
+4. Unpickles each saved model using the current glum and verifies that
+   predictions match the HEAD reference.
+"""
+
+import json
+import pickle
+import subprocess
+import sys
+from pathlib import Path
+
+import numpy as np
+from packaging.version import Version
+from sklearn.datasets import make_regression
+
+SCRIPT_DIR = Path(__file__).resolve().parent  # directory containing this script
+ARTIFACTS_DIR = SCRIPT_DIR / "artifacts"  # dataset and per-version outputs
+
+SKIP_VERSIONS: set[str] = set()  # releases main() skips; none by default
+
+
+def write_dataset() -> None:
+    """Write the fixed dataset to disk so all fit.py invocations use identical data."""
+    X, y = make_regression(n_samples=500, n_features=5, noise=1.0, random_state=42)
+    ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
+    np.save(str(ARTIFACTS_DIR / "X.npy"), X)
+    np.save(str(ARTIFACTS_DIR / "y.npy"), y)
+
+
+def discover_versions() -> list[str]:
+    """Return the latest patch release for each minor version of glum on conda-forge."""
+    result = subprocess.run(
+        ["pixi", "search", "glum", "--json"],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    data = json.loads(result.stdout)
+    platform = next(iter(data))
+    best: dict[tuple[int, int], str] = {}
+    for entry in data[platform]:
+        v = Version(entry["version"])
+        key = (v.major, v.minor)
+        if key not in best or v > Version(best[key]):
+            best[key] = entry["version"]
+    return sorted(best.values(), key=Version)
+
+
+def fit_version(version: str) -> bool:
+    """Run fit.py for the given version and return True on success.
+
+    Uses ``pixi run`` for HEAD and ``pixi exec`` for released versions.
+    """
+    if version == "HEAD":
+        cmd = ["pixi", "run", "python", str(SCRIPT_DIR / "fit.py"), "HEAD"]
+    else:
+        v = Version(version)
+        cmd = ["pixi", "exec", f"--spec=glum=={version}"]
+        # glum <=2.6.0 imports pkg_resources from setuptools, which was removed
+        # in setuptools 82. Pin setuptools<82 for those old versions.
+        if v <= Version("2.6.0"):
+            cmd += ["--spec=setuptools<82"]
+        # glum <=2.3.0: sklearn 1.3 added const qualifiers to _cython_blas
+        # function pointers, breaking the Cython ABI of older glum builds.
+        if v <= Version("2.3.0"):
+            cmd += ["--spec=scikit-learn<1.3"]
+        # glum 3.0.x: sklearn 1.6 removed BaseEstimator._validate_data.
+        elif v < Version("3.1.0"):
+            cmd += ["--spec=scikit-learn<1.6"]
+        cmd += ["python", str(SCRIPT_DIR / "fit.py"), version]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        print(result.stdout, end="")
+        print(result.stderr, end="", file=sys.stderr)
+    return result.returncode == 0
+
+
+def compare_versions(versions: list[str]) -> bool:
+    """Unpickle each version's model and verify its predictions match HEAD.
+
+    Also checks that predictions match the array stored by fit.py to confirm
+    the pickle round-trip is stable. Returns True if all versions pass.
+    """
+    version_dirs = [ARTIFACTS_DIR / v for v in versions if (ARTIFACTS_DIR / v).is_dir()]
+
+    if not version_dirs:
+        print("ERROR: No artifact directories found. Did fit step produce any output?")
+        return False
+
+    X = np.load(str(ARTIFACTS_DIR / "X.npy"))
+    head_predictions = np.load(str(ARTIFACTS_DIR / "HEAD" / "predictions.npy"))
+
+    import glum
+
+    current_version = glum.__version__
+    print(f"Current glum version: {current_version}")
+    print(f"Testing {len(version_dirs)} version(s): {[d.name for d in version_dirs]}\n")
+
+    failures = []
+
+    for version_dir in version_dirs:
+        version = version_dir.name
+        pickle_path = version_dir / "model.pkl"
+        predictions_path = version_dir / "predictions.npy"
+
+        try:
+            with open(pickle_path, "rb") as f:
+                old_model = pickle.load(f)
+        except Exception as e:
+            failures.append(f"{version}: unpickling failed: {e}")
+            continue
+
+        try:
+            old_predictions = old_model.predict(X)
+        except Exception as e:
+            failures.append(f"{version}: predict() failed after unpickling: {e}")
+            continue
+
+        stored_predictions = np.load(str(predictions_path))
+
+        try:
+            np.testing.assert_allclose(
+                old_predictions,
+                stored_predictions,
+                rtol=1e-5,
+                err_msg=f"[{version}] Unpickled predictions do not match stored array",
+            )
+            print(f"[{version}] PASS: unpickled predictions match stored predictions")
+        except AssertionError as e:
+            failures.append(str(e))
+
+        try:
+            np.testing.assert_allclose(
+                old_predictions,
+                head_predictions,
+                rtol=1e-5,
+                err_msg=f"[{version}] Predictions from old model do not match HEAD",
+            )
+            print(f"[{version}] PASS: old model predictions match HEAD")
+        except AssertionError as e:
+            failures.append(str(e))
+
+    print()
+    if failures:
+        print("FAILURES:")
+        for msg in failures:
+            print(f"  - {msg}")
+        return False
+
+    print(f"All {len(version_dirs)} version(s) passed.")
+    return True
+
+
+def main() -> None:
+    """Fit HEAD and all released minor versions, then compare predictions."""
+    write_dataset()
+
+    print("=== Fitting HEAD ===")
+    if not fit_version("HEAD"):
+        print("ERROR: Failed to fit HEAD model.")
+        sys.exit(1)
+
+    print("\n=== Discovering glum versions from conda-forge ===")
+    versions = discover_versions()
+    print(f"Found {len(versions)} minor release(s): {' '.join(versions)}")
+
+    print("\n=== Generating compatibility artifacts ===")
+    fitted_versions = []
+    for version in versions:
+        if version in SKIP_VERSIONS:
+            print(f"--- Skipping glum=={version} (known incompatibility) ---")
+            continue
+        print(f"--- Fitting glum=={version} ---")
+        if fit_version(version):
+            fitted_versions.append(version)
+        else:
+            print(f"WARNING: glum=={version} failed, skipping.")
+
+    print("\n=== Comparing against HEAD ===")
+    if not compare_versions(fitted_versions):
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()