diff --git a/causalpy/checks/convex_hull.py b/causalpy/checks/convex_hull.py index 8a7e2349a..3222ca9b9 100644 --- a/causalpy/checks/convex_hull.py +++ b/causalpy/checks/convex_hull.py @@ -50,8 +50,8 @@ def run( ) -> CheckResult: """Run the convex hull violation check on pre-treatment data.""" sc = experiment - datapre_control = sc.datapre_control # type: ignore[attr-defined] - datapre_treated = sc.datapre_treated # type: ignore[attr-defined] + datapre_control = sc.pre_design["control"] # type: ignore[attr-defined] + datapre_treated = sc.pre_design["treated"] # type: ignore[attr-defined] all_results = [] total_violations = 0 diff --git a/causalpy/experiments/base.py b/causalpy/experiments/base.py index a88c425e5..06c5f76a1 100644 --- a/causalpy/experiments/base.py +++ b/causalpy/experiments/base.py @@ -18,13 +18,16 @@ from __future__ import annotations import contextlib +import warnings from abc import ABC, abstractmethod from pathlib import Path from typing import Any, Literal import arviz as az import matplotlib.pyplot as plt +import numpy as np import pandas as pd +import xarray as xr from sklearn.base import RegressorMixin from causalpy.maketables_adapters import get_maketables_adapter @@ -114,6 +117,67 @@ class BaseExperiment(ABC): _default_model_class: type[PyMCModel] | None = None + _deprecated_design_aliases: dict[str, tuple[str, str]] = {} + """Mapping of ``old_attr -> (dataset_attr, key)`` for deprecated design + matrix accessors. Subclasses populate this so that + ``__getattr__`` can forward accesses with a deprecation warning.""" + + def __getattr__(self, name: str) -> Any: + aliases = type(self)._deprecated_design_aliases + if name in aliases: + dataset_attr, key = aliases[name] + warnings.warn( + f"{name} is deprecated, use {dataset_attr}['{key}']", + DeprecationWarning, + stacklevel=2, + ) + return getattr(self, dataset_attr)[key] + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{name}'" + ) + + @staticmethod + def _build_design_dataset( + X_raw: np.ndarray, + y_raw: np.ndarray, + *, + obs_ind: np.ndarray | pd.Index, + coeffs: list[str], + treated_units: list[str] | None = None, + ) -> xr.Dataset: + """Build a standard ``xr.Dataset`` from raw design matrices. + + Parameters + ---------- + X_raw : np.ndarray + Predictor matrix, shape ``(n_obs, n_coeffs)``. + y_raw : np.ndarray + Outcome matrix, shape ``(n_obs, n_units)``. + obs_ind : array-like + Observation index coordinates. + coeffs : list[str] + Coefficient / column names for ``X_raw``. + treated_units : list[str], optional + Names for the treated-unit dimension of ``y_raw``. + Defaults to ``["unit_0"]``. + """ + if treated_units is None: + treated_units = ["unit_0"] + return xr.Dataset( + { + "X": xr.DataArray( + X_raw, + dims=["obs_ind", "coeffs"], + coords={"obs_ind": obs_ind, "coeffs": coeffs}, + ), + "y": xr.DataArray( + y_raw, + dims=["obs_ind", "treated_units"], + coords={"obs_ind": obs_ind, "treated_units": treated_units}, + ), + } + ) + def __init__(self, model: PyMCModel | RegressorMixin | None = None) -> None: # Ensure we've made any provided Scikit Learn model (as identified as being type # RegressorMixin) compatible with CausalPy by appending our custom methods. diff --git a/causalpy/experiments/diff_in_diff.py b/causalpy/experiments/diff_in_diff.py index cd23a0ae1..150c40e05 100644 --- a/causalpy/experiments/diff_in_diff.py +++ b/causalpy/experiments/diff_in_diff.py @@ -98,6 +98,7 @@ class DifferenceInDifferences(BaseExperiment): supports_ols = True supports_bayes = True _default_model_class = LinearRegression + _deprecated_design_aliases = {"X": ("design", "X"), "y": ("design", "y")} def __init__( self, @@ -130,42 +131,36 @@ def _build_design_matrices(self) -> None: self._y_design_info = y.design_info self._x_design_info = X.design_info self.labels = X.design_info.column_names - self.y, self.X = np.asarray(y), np.asarray(X) + self._y_raw, self._X_raw = np.asarray(y), np.asarray(X) self.outcome_variable_name = y.design_info.column_names[0] def _prepare_data(self) -> None: - """Convert design matrices to xarray DataArrays.""" - self.X = xr.DataArray( - self.X, - dims=["obs_ind", "coeffs"], - coords={ - "obs_ind": np.arange(self.X.shape[0]), - "coeffs": self.labels, - }, - ) - self.y = xr.DataArray( - self.y, - dims=["obs_ind", "treated_units"], - coords={"obs_ind": np.arange(self.y.shape[0]), "treated_units": ["unit_0"]}, + """Bundle design matrices into an ``xr.Dataset``.""" + n = self._X_raw.shape[0] + self.design = self._build_design_dataset( + self._X_raw, + self._y_raw, + obs_ind=np.arange(n), + coeffs=self.labels, ) + del self._X_raw, self._y_raw def algorithm(self) -> None: """Run the experiment algorithm: fit model, predict, and calculate causal impact.""" - # fit model + X = self.design["X"] + y = self.design["y"] + if isinstance(self.model, PyMCModel): COORDS = { "coeffs": self.labels, - "obs_ind": np.arange(self.X.shape[0]), + "obs_ind": np.arange(X.shape[0]), "treated_units": ["unit_0"], } - self.model.fit(X=self.X, y=self.y, coords=COORDS) + self.model.fit(X=X, y=y, coords=COORDS) elif isinstance(self.model, RegressorMixin): - # Ensure the intercept is part of the coefficients array rather than - # a separate intercept_ attribute. See #664 / PR #693 for - # centralising this in BaseExperiment. if hasattr(self.model, "fit_intercept"): self.model.fit_intercept = False - self.model.fit(X=self.X, y=self.y) + self.model.fit(X=X, y=y) else: raise ValueError("Model type not recognized") diff --git a/causalpy/experiments/interrupted_time_series.py b/causalpy/experiments/interrupted_time_series.py index a11d23fda..eb0823862 100644 --- a/causalpy/experiments/interrupted_time_series.py +++ b/causalpy/experiments/interrupted_time_series.py @@ -128,6 +128,12 @@ class InterruptedTimeSeries(BaseExperiment): supports_ols = True supports_bayes = True _default_model_class = LinearRegression + _deprecated_design_aliases = { + "pre_X": ("pre_design", "X"), + "pre_y": ("pre_design", "y"), + "post_X": ("post_design", "X"), + "post_y": ("post_design", "y"), + } def __init__( self, @@ -139,9 +145,8 @@ def __init__( **kwargs: Any, ) -> None: super().__init__(model=model) - self.pre_y: xr.DataArray - self.post_y: xr.DataArray - # rename the index to "obs_ind" + self.pre_design: xr.Dataset + self.post_design: xr.Dataset data.index.name = "obs_ind" self.data = data self.input_validation(data, treatment_time, treatment_end_time) @@ -155,96 +160,76 @@ def __init__( def _build_design_matrices(self) -> None: """Build design matrices for pre and post intervention periods using patsy.""" - # set things up with pre-intervention data y, X = dmatrices(self.formula, self.datapre) self.outcome_variable_name = y.design_info.column_names[0] self._y_design_info = y.design_info self._x_design_info = X.design_info self.labels = X.design_info.column_names - self.pre_y, self.pre_X = np.asarray(y), np.asarray(X) - # process post-intervention data + self._pre_y_raw, self._pre_X_raw = np.asarray(y), np.asarray(X) (new_y, new_x) = build_design_matrices( [self._y_design_info, self._x_design_info], self.datapost ) - self.post_X = np.asarray(new_x) - self.post_y = np.asarray(new_y) + self._post_X_raw = np.asarray(new_x) + self._post_y_raw = np.asarray(new_y) def _prepare_data(self) -> None: - """Convert design matrices to xarray DataArrays for pre and post periods.""" - self.pre_X = xr.DataArray( - self.pre_X, - dims=["obs_ind", "coeffs"], - coords={ - "obs_ind": self.datapre.index, - "coeffs": self.labels, - }, - ) - self.pre_y = xr.DataArray( - self.pre_y, # Keep 2D shape - dims=["obs_ind", "treated_units"], - coords={"obs_ind": self.datapre.index, "treated_units": ["unit_0"]}, - ) - self.post_X = xr.DataArray( - self.post_X, - dims=["obs_ind", "coeffs"], - coords={ - "obs_ind": self.datapost.index, - "coeffs": self.labels, - }, + """Bundle design matrices into ``xr.Dataset`` objects for pre and post periods.""" + self.pre_design = self._build_design_dataset( + self._pre_X_raw, + self._pre_y_raw, + obs_ind=self.datapre.index, + coeffs=self.labels, ) - self.post_y = xr.DataArray( - self.post_y, # Keep 2D shape - dims=["obs_ind", "treated_units"], - coords={"obs_ind": self.datapost.index, "treated_units": ["unit_0"]}, + self.post_design = self._build_design_dataset( + self._post_X_raw, + self._post_y_raw, + obs_ind=self.datapost.index, + coeffs=self.labels, ) + del self._pre_X_raw, self._pre_y_raw, self._post_X_raw, self._post_y_raw def algorithm(self) -> None: """Run the experiment algorithm: fit model, predict, and calculate causal impact.""" - # fit the model to the observed (pre-intervention) data - # All PyMC models now accept xr.DataArray with consistent API + pre_X = self.pre_design["X"] + pre_y = self.pre_design["y"] + post_X = self.post_design["X"] + post_y = self.post_design["y"] + if isinstance(self.model, PyMCModel): COORDS: dict[str, Any] = { "coeffs": self.labels, - "obs_ind": np.arange(self.pre_X.shape[0]), + "obs_ind": np.arange(pre_X.shape[0]), "treated_units": ["unit_0"], - "datetime_index": self.datapre.index, # For time series models + "datetime_index": self.datapre.index, } - self.model.fit(X=self.pre_X, y=self.pre_y, coords=COORDS) + self.model.fit(X=pre_X, y=pre_y, coords=COORDS) elif isinstance(self.model, RegressorMixin): - # For OLS models, use 1D y data - self.model.fit(X=self.pre_X, y=self.pre_y.isel(treated_units=0)) + self.model.fit(X=pre_X, y=pre_y.isel(treated_units=0)) else: raise ValueError("Model type not recognized") - # score the goodness of fit to the pre-intervention data if isinstance(self.model, PyMCModel): - self.score = self.model.score(X=self.pre_X, y=self.pre_y) + self.score = self.model.score(X=pre_X, y=pre_y) elif isinstance(self.model, RegressorMixin): - self.score = self.model.score( - X=self.pre_X, y=self.pre_y.isel(treated_units=0) - ) + self.score = self.model.score(X=pre_X, y=pre_y.isel(treated_units=0)) - # get the model predictions of the observed (pre-intervention) data if isinstance(self.model, PyMCModel | RegressorMixin): - self.pre_pred = self.model.predict(X=self.pre_X) + self.pre_pred = self.model.predict(X=pre_X) - # calculate the counterfactual (post period) if isinstance(self.model, PyMCModel): - self.post_pred = self.model.predict(X=self.post_X, out_of_sample=True) + self.post_pred = self.model.predict(X=post_X, out_of_sample=True) elif isinstance(self.model, RegressorMixin): - self.post_pred = self.model.predict(X=self.post_X) + self.post_pred = self.model.predict(X=post_X) - # calculate impact - all PyMC models now use 2D data with treated_units if isinstance(self.model, PyMCModel): - self.pre_impact = self.model.calculate_impact(self.pre_y, self.pre_pred) - self.post_impact = self.model.calculate_impact(self.post_y, self.post_pred) + self.pre_impact = self.model.calculate_impact(pre_y, self.pre_pred) + self.post_impact = self.model.calculate_impact(post_y, self.post_pred) elif isinstance(self.model, RegressorMixin): - # SKL models work with 1D data self.pre_impact = self.model.calculate_impact( - self.pre_y.isel(treated_units=0), self.pre_pred + pre_y.isel(treated_units=0), self.pre_pred ) self.post_impact = self.model.calculate_impact( - self.post_y.isel(treated_units=0), self.post_pred + post_y.isel(treated_units=0), self.post_pred ) self.post_impact_cumulative = self.model.calculate_cumulative_impact( @@ -625,9 +610,7 @@ def _bayesian_plot( (h,) = ax[0].plot( self.datapre.index, - self.pre_y.isel(treated_units=0) - if hasattr(self.pre_y, "isel") - else self.pre_y[:, 0], + self.pre_design["y"].isel(treated_units=0), "k.", label="Observations", ) @@ -652,9 +635,7 @@ def _bayesian_plot( ax[0].plot( self.datapost.index, - self.post_y.isel(treated_units=0) - if hasattr(self.post_y, "isel") - else self.post_y[:, 0], + self.post_design["y"].isel(treated_units=0), "k.", ) # Shaded causal effect @@ -667,9 +648,7 @@ def _bayesian_plot( h = ax[0].fill_between( self.datapost.index, y1=post_pred_mu, - y2=self.post_y.isel(treated_units=0) - if hasattr(self.post_y, "isel") - else self.post_y[:, 0], + y2=self.post_design["y"].isel(treated_units=0), color="C0", alpha=0.25, ) @@ -805,10 +784,10 @@ def _ols_plot( fig, ax = plt.subplots(3, 1, sharex=True, figsize=(7, 8)) - ax[0].plot(self.datapre.index, self.pre_y, "k.") + ax[0].plot(self.datapre.index, self.pre_design["y"], "k.") ax[0].plot(self.datapre.index, self.pre_pred, c="k", label="model fit") - ax[0].plot(self.datapost.index, self.post_y, "k.") + ax[0].plot(self.datapost.index, self.post_design["y"], "k.") ax[0].plot( self.datapost.index, self.post_pred, @@ -820,7 +799,7 @@ def _ols_plot( ax[0].fill_between( self.datapost.index, y1=np.squeeze(self.post_pred), - y2=np.squeeze(self.post_y), + y2=np.squeeze(self.post_design["y"]), color="C0", alpha=0.25, label="Causal impact", diff --git a/causalpy/experiments/panel_regression.py b/causalpy/experiments/panel_regression.py index d8f14f54f..c38f2c6d1 100644 --- a/causalpy/experiments/panel_regression.py +++ b/causalpy/experiments/panel_regression.py @@ -21,7 +21,6 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -import xarray as xr from matplotlib.gridspec import GridSpec from patsy import dmatrices from scipy import stats @@ -178,6 +177,7 @@ class PanelRegression(BaseExperiment): supports_ols = True supports_bayes = True + _deprecated_design_aliases = {"X": ("design", "X"), "y": ("design", "y")} def __init__( self, @@ -277,40 +277,35 @@ def _build_design_matrices(self) -> None: self._y_design_info = y.design_info self._x_design_info = X.design_info self.labels = X.design_info.column_names - self.y, self.X = np.asarray(y), np.asarray(X) + self._y_raw, self._X_raw = np.asarray(y), np.asarray(X) def _prepare_data(self) -> None: - """Convert design matrices to xarray DataArrays.""" - self.X = xr.DataArray( # type: ignore[assignment] - self.X, - dims=["obs_ind", "coeffs"], - coords={ - "obs_ind": np.arange(self.X.shape[0]), - "coeffs": self.labels, - }, - ) - self.y = xr.DataArray( # type: ignore[assignment] - self.y, - dims=["obs_ind", "treated_units"], - coords={"obs_ind": np.arange(self.y.shape[0]), "treated_units": ["unit_0"]}, + """Bundle design matrices into an ``xr.Dataset``.""" + n = self._X_raw.shape[0] + self.design = self._build_design_dataset( + self._X_raw, + self._y_raw, + obs_ind=np.arange(n), + coeffs=self.labels, ) + del self._X_raw, self._y_raw def algorithm(self) -> None: """Run the experiment algorithm: fit the model.""" + X = self.design["X"] + y = self.design["y"] + if isinstance(self.model, PyMCModel): COORDS = { "coeffs": self.labels, - "obs_ind": np.arange(self.X.shape[0]), + "obs_ind": np.arange(X.shape[0]), "treated_units": ["unit_0"], } - self.model.fit(X=self.X, y=self.y, coords=COORDS) # type: ignore[arg-type] + self.model.fit(X=X, y=y, coords=COORDS) elif isinstance(self.model, RegressorMixin): - # For scikit-learn models, set fit_intercept=False so that the - # patsy intercept column is included in the coefficients array. - # TODO: later, this should be handled in ScikitLearnAdaptor itself if hasattr(self.model, "fit_intercept"): self.model.fit_intercept = False - self.model.fit(X=self.X, y=self.y) + self.model.fit(X=X, y=y) def _demean_transform(self, data: pd.DataFrame, group_var: str) -> pd.DataFrame: """Apply demeaned transformation (demean by group). @@ -414,7 +409,7 @@ def summary(self, round_to: int | None = None) -> None: if self.n_periods: print(f"Periods: {self.n_periods} ({self.time_fe_variable})") print(f"FE method: {self.fe_method}") - print(f"Observations: {self.X.shape[0]}") + print(f"Observations: {self.design['X'].shape[0]}") print("=" * 60) coeff_labels = self._get_non_fe_labels() @@ -566,7 +561,7 @@ def get_plot_data_bayesian(self, **kwargs: Any) -> pd.DataFrame: plot_data = pd.DataFrame( { - "y_actual": self.y.values.flatten(), # type: ignore[attr-defined] + "y_actual": self.design["y"].values.flatten(), "y_fitted": pred_mean, "y_fitted_lower": pred_lower, "y_fitted_upper": pred_upper, @@ -588,13 +583,13 @@ def get_plot_data_ols(self, **kwargs: Any) -> pd.DataFrame: DataFrame with fitted values """ if isinstance(self.model, RegressorMixin): - y_fitted = np.squeeze(self.model.predict(self.X)) # type: ignore[attr-defined] + y_fitted = np.squeeze(self.model.predict(self.design["X"])) else: raise ValueError("Model is not an OLS model") plot_data = pd.DataFrame( { - "y_actual": self.y.values.flatten(), # type: ignore[attr-defined] + "y_actual": self.design["y"].values.flatten(), "y_fitted": y_fitted, self.unit_fe_variable: self.data[self.unit_fe_variable].values, } @@ -837,7 +832,7 @@ def plot_trajectories( sorted_obs_indices = unit_obs_indices[sort_order] # Get actual y values (sorted by time) - y_actual = self.y.values.flatten()[sorted_obs_indices] # type: ignore[attr-defined] + y_actual = self.design["y"].values.flatten()[sorted_obs_indices] # Plot actual ax.plot( @@ -880,7 +875,9 @@ def plot_trajectories( ) else: # OLS: get fitted values for this unit - y_fitted = np.squeeze(self.model.predict(self.X))[sorted_obs_indices] # type: ignore[union-attr, attr-defined] + y_fitted = np.squeeze(self.model.predict(self.design["X"]))[ + sorted_obs_indices + ] # type: ignore[union-attr] ax.plot( sorted_time_vals, y_fitted, diff --git a/causalpy/experiments/piecewise_its.py b/causalpy/experiments/piecewise_its.py index 929650066..2124fb0d5 100644 --- a/causalpy/experiments/piecewise_its.py +++ b/causalpy/experiments/piecewise_its.py @@ -146,6 +146,7 @@ class PiecewiseITS(BaseExperiment): supports_ols = True supports_bayes = True _default_model_class = LinearRegression + _deprecated_design_aliases = {"X": ("design", "X"), "y": ("design", "y")} def __init__( self, @@ -187,51 +188,40 @@ def __init__( n_obs = X_array.shape[0] - # Convert to xarray DataArrays - self.X = xr.DataArray( + # Bundle into xr.Dataset + self.design = self._build_design_dataset( X_array, - dims=["obs_ind", "coeffs"], - coords={ - "obs_ind": np.arange(n_obs), - "coeffs": self.labels, - }, - ) - - self.y = xr.DataArray( y_array, - dims=["obs_ind", "treated_units"], - coords={ - "obs_ind": np.arange(n_obs), - "treated_units": ["unit_0"], - }, + obs_ind=np.arange(n_obs), + coeffs=self.labels, ) # Track which columns are interruption-related (for counterfactual) self._interruption_cols = self._get_interruption_column_indices() - # Fit the model to the full time series + X = self.design["X"] + y = self.design["y"] + if isinstance(self.model, PyMCModel): COORDS: dict[str, Any] = { "coeffs": self.labels, - "obs_ind": np.arange(self.X.shape[0]), + "obs_ind": np.arange(X.shape[0]), "treated_units": ["unit_0"], } - self.model.fit(X=self.X, y=self.y, coords=COORDS) + self.model.fit(X=X, y=y, coords=COORDS) elif isinstance(self.model, RegressorMixin): if hasattr(self.model, "fit_intercept"): self.model.fit_intercept = False - self.model.fit(X=self.X, y=self.y.isel(treated_units=0)) + self.model.fit(X=X, y=y.isel(treated_units=0)) else: raise ValueError("Model type not recognized") - # Compute predictions (fitted values) - self.y_pred = self.model.predict(X=self.X) + self.y_pred = self.model.predict(X=X) - # Score the model fit if isinstance(self.model, PyMCModel): - self.score = self.model.score(X=self.X, y=self.y) + self.score = self.model.score(X=X, y=y) elif isinstance(self.model, RegressorMixin): - self.score = self.model.score(X=self.X, y=self.y.isel(treated_units=0)) + self.score = self.model.score(X=X, y=y.isel(treated_units=0)) # Compute counterfactual and effects self._compute_counterfactual_and_effects() @@ -333,7 +323,7 @@ def _compute_counterfactual_and_effects(self) -> None: compatibility with effect_summary() from BaseExperiment. """ # Create design matrix for counterfactual (zero out interruption columns) - X_cf = self.X.copy() + X_cf = self.design["X"].copy() for idx in self._interruption_cols: X_cf[:, idx] = 0 @@ -470,7 +460,7 @@ def _bayesian_plot( # Observed data (h_obs,) = ax[0].plot( time_values, - self.y.isel(treated_units=0), + self.design["y"].isel(treated_units=0), "k.", label="Observations", ) @@ -584,7 +574,7 @@ def _ols_plot( fig, ax = plt.subplots(3, 1, sharex=True, figsize=(10, 10)) # TOP PLOT: Observed, Fitted, and Counterfactual - ax[0].plot(time_values, self.y.values, "k.", label="Observations") + ax[0].plot(time_values, self.design["y"].values, "k.", label="Observations") ax[0].plot(time_values, self.y_pred, "C0-", label="Fitted", linewidth=2) ax[0].plot( time_values, @@ -685,7 +675,9 @@ def _get_hdi_bounds( result = pd.DataFrame( { self.time_col: time_values, - self.outcome_variable_name: self.y.isel(treated_units=0).values, + self.outcome_variable_name: self.design["y"] + .isel(treated_units=0) + .values, "fitted": fitted_mean, f"fitted_hdi_lower_{hdi_pct}": fitted_lower, f"fitted_hdi_upper_{hdi_pct}": fitted_upper, @@ -718,7 +710,7 @@ def get_plot_data_ols(self) -> pd.DataFrame: result = pd.DataFrame( { self.time_col: time_values, - self.outcome_variable_name: self.y.values.flatten(), + self.outcome_variable_name: self.design["y"].values.flatten(), "fitted": np.squeeze(self.y_pred), "counterfactual": np.squeeze(self.y_counterfactual), "effect": self.effect, diff --git a/causalpy/experiments/prepostnegd.py b/causalpy/experiments/prepostnegd.py index 07b8492dc..9ec3fd122 100644 --- a/causalpy/experiments/prepostnegd.py +++ b/causalpy/experiments/prepostnegd.py @@ -88,6 +88,7 @@ class PrePostNEGD(BaseExperiment): supports_ols = False supports_bayes = True _default_model_class = LinearRegression + _deprecated_design_aliases = {"X": ("design", "X"), "y": ("design", "y")} def __init__( self, @@ -119,35 +120,31 @@ def _build_design_matrices(self) -> None: self._y_design_info = y.design_info self._x_design_info = X.design_info self.labels = X.design_info.column_names - self.y, self.X = np.asarray(y), np.asarray(X) + self._y_raw, self._X_raw = np.asarray(y), np.asarray(X) self.outcome_variable_name = y.design_info.column_names[0] def _prepare_data(self) -> None: - """Convert design matrices to xarray DataArrays.""" - self.X = xr.DataArray( - self.X, - dims=["obs_ind", "coeffs"], - coords={ - "obs_ind": np.arange(self.X.shape[0]), - "coeffs": self.labels, - }, - ) - self.y = xr.DataArray( - self.y, - dims=["obs_ind", "treated_units"], - coords={"obs_ind": self.data.index, "treated_units": ["unit_0"]}, + """Bundle design matrices into an ``xr.Dataset``.""" + self.design = self._build_design_dataset( + self._X_raw, + self._y_raw, + obs_ind=self.data.index, + coeffs=self.labels, ) + del self._X_raw, self._y_raw def algorithm(self) -> None: """Run the experiment algorithm: fit model, predict, and calculate causal impact.""" - # fit the model to the observed (pre-intervention) data + X = self.design["X"] + y = self.design["y"] + if isinstance(self.model, PyMCModel): COORDS = { "coeffs": self.labels, - "obs_ind": np.arange(self.X.shape[0]), + "obs_ind": np.arange(X.shape[0]), "treated_units": ["unit_0"], } - self.model.fit(X=self.X, y=self.y, coords=COORDS) + self.model.fit(X=X, y=y, coords=COORDS) elif isinstance(self.model, RegressorMixin): raise NotImplementedError("Not implemented for OLS model") else: diff --git a/causalpy/experiments/regression_discontinuity.py b/causalpy/experiments/regression_discontinuity.py index 5fdbdcf65..de2507c6b 100644 --- a/causalpy/experiments/regression_discontinuity.py +++ b/causalpy/experiments/regression_discontinuity.py @@ -24,7 +24,6 @@ from matplotlib import pyplot as plt from patsy import build_design_matrices, dmatrices from sklearn.base import RegressorMixin -import xarray as xr from causalpy.custom_exceptions import ( DataException, FormulaException, @@ -92,6 +91,7 @@ class RegressionDiscontinuity(BaseExperiment): supports_ols = True supports_bayes = True _default_model_class = LinearRegression + _deprecated_design_aliases = {"X": ("design", "X"), "y": ("design", "y")} def __init__( self, @@ -154,43 +154,38 @@ def _build_design_matrices(self) -> None: self._y_design_info = y.design_info self._x_design_info = X.design_info self.labels = X.design_info.column_names - self.y, self.X = np.asarray(y), np.asarray(X) + self._y_raw, self._X_raw = np.asarray(y), np.asarray(X) self.outcome_variable_name = y.design_info.column_names[0] def _prepare_data(self) -> None: - """Convert design matrices to xarray DataArrays.""" - self.X = xr.DataArray( - self.X, - dims=["obs_ind", "coeffs"], - coords={ - "obs_ind": np.arange(self.X.shape[0]), - "coeffs": self.labels, - }, - ) - self.y = xr.DataArray( - self.y, - dims=["obs_ind", "treated_units"], - coords={"obs_ind": np.arange(self.y.shape[0]), "treated_units": ["unit_0"]}, + """Bundle design matrices into an ``xr.Dataset``.""" + n = self._X_raw.shape[0] + self.design = self._build_design_dataset( + self._X_raw, + self._y_raw, + obs_ind=np.arange(n), + coeffs=self.labels, ) + del self._X_raw, self._y_raw def algorithm(self) -> None: """Run the experiment algorithm: fit model, predict, and calculate discontinuity.""" - # fit model + X = self.design["X"] + y = self.design["y"] + if isinstance(self.model, PyMCModel): - # fit the model to the observed (pre-intervention) data COORDS = { "coeffs": self.labels, - "obs_ind": np.arange(self.X.shape[0]), + "obs_ind": np.arange(X.shape[0]), "treated_units": ["unit_0"], } - self.model.fit(X=self.X, y=self.y, coords=COORDS) + self.model.fit(X=X, y=y, coords=COORDS) elif isinstance(self.model, RegressorMixin): - self.model.fit(X=self.X, y=self.y) + self.model.fit(X=X, y=y) else: raise ValueError("Model type not recognized") - # score the goodness of fit to all data - self.score = self.model.score(X=self.X, y=self.y) + self.score = self.model.score(X=X, y=y) # get the model predictions of the observed data if self.bandwidth is not np.inf: diff --git a/causalpy/experiments/regression_kink.py b/causalpy/experiments/regression_kink.py index 5faf60999..0c872e471 100644 --- a/causalpy/experiments/regression_kink.py +++ b/causalpy/experiments/regression_kink.py @@ -64,6 +64,7 @@ class RegressionKink(BaseExperiment): supports_ols = False supports_bayes = True _default_model_class = LinearRegression + _deprecated_design_aliases = {"X": ("design", "X"), "y": ("design", "y")} def __init__( self, @@ -108,36 +109,33 @@ def _build_design_matrices(self) -> None: self._y_design_info = y.design_info self._x_design_info = X.design_info self.labels = X.design_info.column_names - self.y, self.X = np.asarray(y), np.asarray(X) + self._y_raw, self._X_raw = np.asarray(y), np.asarray(X) self.outcome_variable_name = y.design_info.column_names[0] def _prepare_data(self) -> None: - """Convert design matrices to xarray DataArrays.""" - self.X = xr.DataArray( - self.X, - dims=["obs_ind", "coeffs"], - coords={ - "obs_ind": np.arange(self.X.shape[0]), - "coeffs": self.labels, - }, - ) - self.y = xr.DataArray( - self.y, - dims=["obs_ind", "treated_units"], - coords={"obs_ind": np.arange(self.y.shape[0]), "treated_units": ["unit_0"]}, + """Bundle design matrices into an ``xr.Dataset``.""" + n = self._X_raw.shape[0] + self.design = self._build_design_dataset( + self._X_raw, + self._y_raw, + obs_ind=np.arange(n), + coeffs=self.labels, ) + del self._X_raw, self._y_raw def algorithm(self) -> None: """Run the experiment algorithm: fit model, predict, and evaluate gradient change.""" + X = self.design["X"] + y = self.design["y"] + COORDS = { "coeffs": self.labels, - "obs_ind": np.arange(self.X.shape[0]), + "obs_ind": np.arange(X.shape[0]), "treated_units": ["unit_0"], } - self.model.fit(X=self.X, y=self.y, coords=COORDS) + self.model.fit(X=X, y=y, coords=COORDS) - # score the goodness of fit to all data - self.score = self.model.score(X=self.X, y=self.y) + self.score = self.model.score(X=X, y=y) # get the model predictions of the observed data if self.bandwidth is not np.inf: diff --git a/causalpy/experiments/synthetic_control.py b/causalpy/experiments/synthetic_control.py index c65a70514..60994d163 100644 --- a/causalpy/experiments/synthetic_control.py +++ b/causalpy/experiments/synthetic_control.py @@ -87,6 +87,12 @@ class SyntheticControl(BaseExperiment): supports_ols = True supports_bayes = True _default_model_class = WeightedSumFitter + _deprecated_design_aliases = { + "datapre_control": ("pre_design", "control"), + "datapre_treated": ("pre_design", "treated"), + "datapost_control": ("post_design", "control"), + "datapost_treated": ("post_design", "treated"), + } def __init__( self, @@ -126,12 +132,12 @@ def _check_convex_hull(self) -> None: total_above = 0 total_below = 0 n_units = len(self.treated_units) - n_pre_points = self.datapre_treated.shape[0] + n_pre_points = self.pre_design["treated"].shape[0] for i in range(n_units): unit_check = check_convex_hull_violation( - self.datapre_treated.isel(treated_units=i).values, - self.datapre_control.values, + self.pre_design["treated"].isel(treated_units=i).values, + self.pre_design["control"].values, ) total_violations += unit_check["n_violations"] total_above += unit_check["pct_above"] * n_pre_points / 100 @@ -222,42 +228,46 @@ def datapost(self) -> pd.DataFrame: return self.data[self.data.index >= self.treatment_time] def _prepare_data(self) -> None: - """Prepare xarray DataArrays for control and treated units in pre/post periods.""" - # split data into the 4 quadrants (pre/post, control/treated) and store as - # xarray.DataArray objects. - # NOTE: if we have renamed/ensured the index is named "obs_ind", then it will - # make constructing the xarray DataArray objects easier. - self.datapre_control = xr.DataArray( - self.datapre[self.control_units], - dims=["obs_ind", "coeffs"], - coords={ - "obs_ind": self.datapre[self.control_units].index, - "coeffs": self.control_units, - }, - ) - self.datapre_treated = xr.DataArray( - self.datapre[self.treated_units], - dims=["obs_ind", "treated_units"], - coords={ - "obs_ind": self.datapre[self.treated_units].index, - "treated_units": self.treated_units, - }, - ) - self.datapost_control = xr.DataArray( - self.datapost[self.control_units], - dims=["obs_ind", "coeffs"], - coords={ - "obs_ind": self.datapost[self.control_units].index, - "coeffs": self.control_units, - }, + """Bundle control and treated data into ``xr.Dataset`` objects per period.""" + self.pre_design = xr.Dataset( + { + "control": xr.DataArray( + self.datapre[self.control_units], + dims=["obs_ind", "coeffs"], + coords={ + "obs_ind": self.datapre[self.control_units].index, + "coeffs": self.control_units, + }, + ), + "treated": xr.DataArray( + self.datapre[self.treated_units], + dims=["obs_ind", "treated_units"], + coords={ + "obs_ind": self.datapre[self.treated_units].index, + "treated_units": self.treated_units, + }, + ), + } ) - self.datapost_treated = xr.DataArray( - self.datapost[self.treated_units], - dims=["obs_ind", "treated_units"], - coords={ - "obs_ind": self.datapost[self.treated_units].index, - "treated_units": self.treated_units, - }, + self.post_design = xr.Dataset( + { + "control": xr.DataArray( + self.datapost[self.control_units], + dims=["obs_ind", "coeffs"], + coords={ + "obs_ind": self.datapost[self.control_units].index, + "coeffs": self.control_units, + }, + ), + "treated": xr.DataArray( + self.datapost[self.treated_units], + dims=["obs_ind", "treated_units"], + coords={ + "obs_ind": self.datapost[self.treated_units].index, + "treated_units": self.treated_units, + }, + ), + } ) def algorithm(self) -> None: @@ -273,35 +283,35 @@ def algorithm(self) -> None: "obs_ind": np.arange(self.datapre.shape[0]), } self.model.fit( - X=self.datapre_control, - y=self.datapre_treated, + X=self.pre_design["control"], + y=self.pre_design["treated"], coords=COORDS, ) elif isinstance(self.model, RegressorMixin): self.model.fit( - X=self.datapre_control.data, - y=self.datapre_treated.isel(treated_units=0).data, + X=self.pre_design["control"].data, + y=self.pre_design["treated"].isel(treated_units=0).data, ) else: raise ValueError("Model type not recognized") # score the goodness of fit to the pre-intervention data self.score = self.model.score( - X=self.datapre_control, - y=self.datapre_treated, + X=self.pre_design["control"], + y=self.pre_design["treated"], ) # get the model predictions of the observed (pre-intervention) data - self.pre_pred = self.model.predict(X=self.datapre_control) + self.pre_pred = self.model.predict(X=self.pre_design["control"]) # calculate the counterfactual - self.post_pred = self.model.predict(X=self.datapost_control) + self.post_pred = self.model.predict(X=self.post_design["control"]) self.pre_impact = self.model.calculate_impact( - self.datapre_treated, self.pre_pred + self.pre_design["treated"], self.pre_pred ) self.post_impact = self.model.calculate_impact( - self.datapost_treated, self.post_pred + self.post_design["treated"], self.post_pred ) self.post_impact_cumulative = self.model.calculate_cumulative_impact( @@ -336,7 +346,9 @@ def _pre_treatment_correlations(self) -> dict[str, float]: """ correlations: dict[str, float] = {} for unit in self.treated_units: - observed = self.datapre_treated.sel(treated_units=unit).values.flatten() + observed = ( + self.pre_design["treated"].sel(treated_units=unit).values.flatten() + ) if isinstance(self.model, PyMCModel): predicted = ( self.pre_pred["posterior_predictive"]["mu"] @@ -428,7 +440,7 @@ def _bayesian_plot( # Plot observations for primary treated unit (h,) = ax[0].plot( self.datapre.index, - self.datapre_treated.sel(treated_units=treated_unit), + self.pre_design["treated"].sel(treated_units=treated_unit), "k.", label="Observations", ) @@ -447,14 +459,14 @@ def _bayesian_plot( ax[0].plot( self.datapost.index, - self.datapost_treated.sel(treated_units=treated_unit), + self.post_design["treated"].sel(treated_units=treated_unit), "k.", ) # Shaded causal effect for primary treated unit h = ax[0].fill_between( self.datapost.index, y1=post_pred.mean(dim=["chain", "draw"]).values, - y2=self.datapost_treated.sel(treated_units=treated_unit).values, + y2=self.post_design["treated"].sel(treated_units=treated_unit).values, color="C0", alpha=0.25, label="Causal impact", @@ -520,14 +532,14 @@ def _bayesian_plot( # plot control units as well ax[0].plot( self.datapre.index, - self.datapre_control, + self.pre_design["control"], "-", c=[0.8, 0.8, 0.8], zorder=1, ) ax[0].plot( self.datapost.index, - self.datapost_control, + self.post_design["control"], "-", c=[0.8, 0.8, 0.8], zorder=1, @@ -574,13 +586,13 @@ def _ols_plot( fig, ax = plt.subplots(3, 1, sharex=True, figsize=(7, 8)) ax[0].plot( - self.datapre_treated["obs_ind"], - self.datapre_treated.sel(treated_units=treated_unit), + self.pre_design["treated"]["obs_ind"], + self.pre_design["treated"].sel(treated_units=treated_unit), "k.", ) ax[0].plot( - self.datapost_treated["obs_ind"], - self.datapost_treated.sel(treated_units=treated_unit), + self.post_design["treated"]["obs_ind"], + self.post_design["treated"].sel(treated_units=treated_unit), "k.", ) @@ -599,7 +611,9 @@ def _ols_plot( ax[0].fill_between( self.datapost.index, y1=post_pred_values, - y2=np.squeeze(self.datapost_treated.sel(treated_units=treated_unit).data), + y2=np.squeeze( + self.post_design["treated"].sel(treated_units=treated_unit).data + ), color="C0", alpha=0.25, label="Causal impact", diff --git a/causalpy/maketables_adapters.py b/causalpy/maketables_adapters.py index 046f15de3..2d1210580 100644 --- a/causalpy/maketables_adapters.py +++ b/causalpy/maketables_adapters.py @@ -57,7 +57,10 @@ def default_stat_keys(self, experiment: Any) -> list[str] | None: def _safe_observation_count(experiment: Any) -> int | None: """Best-effort observation count across experiment classes.""" - for attr in ("X", "data", "datapre", "datapost"): + design = getattr(experiment, "design", None) + if isinstance(design, xr.Dataset) and "X" in design: + return int(design["X"].shape[0]) + for attr in ("data", "datapre", "datapost"): obj = getattr(experiment, attr, None) if obj is None: continue diff --git a/causalpy/pymc_models.py b/causalpy/pymc_models.py index f4fdd5ce9..6dbea3c04 100644 --- a/causalpy/pymc_models.py +++ b/causalpy/pymc_models.py @@ -75,6 +75,11 @@ class PyMCModel(pm.Model): methods like `fit`, `predict`, and `score`. It also provides other methods which are useful for causal inference. + The base :meth:`_data_setter` assumes the model graph contains mutable data + nodes named ``"X"`` (predictors) and ``"y"`` (target). Subclasses that use + different data nodes should override :meth:`_data_setter`. See + :class:`BayesianBasisExpansionTimeSeries` for an example. + Example ------- >>> import causalpy as cp @@ -253,18 +258,24 @@ def build_model( def _data_setter(self, X: xr.DataArray) -> None: """ - Set data for the model. + Set data for the model for prediction. - This method is used internally to register new data for the model for - prediction. + This method is called by :meth:`predict` to register new predictor data + and reshape the target placeholder so that ``pm.sample_posterior_predictive`` + can run with the new observation count. - NOTE: We are actively changing the `X`. Often, this matrix will have a different - number of rows than the original data. So to make the shapes work, we need to - update all data nodes in the model to have the correct shape. The values are not - used, so we set them to 0. In our case, we just have data nodes X and y, but if - in the future we get more complex models with more data nodes, then we'll need - to update all of them - ideally programmatically. + The base implementation updates mutable data nodes named ``"X"`` and + ``"y"``. Subclasses that use different data nodes should override this + method. See :class:`BayesianBasisExpansionTimeSeries` for an example. """ + for name in ("X", "y"): + if name not in self.named_vars: + raise ValueError( + f"Data node '{name}' not found in model. " + f"If your model uses different data node names, " + f"override _data_setter()." + ) + new_no_of_observations = X.shape[0] # Use integer indices for obs_ind to avoid datetime compatibility issues with PyMC @@ -277,7 +288,6 @@ def _data_setter(self, X: xr.DataArray) -> None: ) n_treated_units = len(treated_units_coord) - # Always use 2D format for consistency pm.set_data( {"X": X, "y": np.zeros((new_no_of_observations, n_treated_units))}, coords={"obs_ind": obs_coords}, diff --git a/causalpy/reporting.py b/causalpy/reporting.py index 0eef0f379..ef297ea09 100644 --- a/causalpy/reporting.py +++ b/causalpy/reporting.py @@ -1478,10 +1478,12 @@ def _compute_statistics_did_ols( # Calculate standard error from model residuals # Get fitted values and residuals - y_pred = result.model.predict(result.X) - residuals = result.y - y_pred + X_da = result.design["X"] + y_da = result.design["y"] + y_pred = result.model.predict(X_da) + residuals = y_da - y_pred mse = np.mean(residuals**2) - n, p = result.X.shape + n, p = X_da.shape df = n - p # Find the interaction term coefficient index @@ -1497,8 +1499,7 @@ def _compute_statistics_did_ols( if coeff_idx is None: raise ValueError(f"Could not find interaction term {interaction_term} in model") - # Calculate standard error for this coefficient - X = result.X + X = X_da try: # Try to get X as numpy array if hasattr(X, "values"): @@ -1610,10 +1611,12 @@ def _compute_statistics_rd_ols(result, alpha=0.05): discontinuity = result.discontinuity_at_threshold # scalar # Calculate standard error from model - y_pred = result.model.predict(result.X) - residuals = result.y - y_pred + X_da = result.design["X"] + y_da = result.design["y"] + y_pred = result.model.predict(X_da) + residuals = y_da - y_pred mse = np.mean(residuals**2) - n, p = result.X.shape + n, p = X_da.shape df = n - p # Find the treated coefficient index @@ -1624,11 +1627,9 @@ def _compute_statistics_rd_ols(result, alpha=0.05): break if coeff_idx is None: - # Fallback: use simple approximation se = np.std(residuals) / np.sqrt(n) else: - # Calculate standard error for this coefficient - X = result.X + X = X_da try: if hasattr(X, "values"): X = X.values diff --git a/causalpy/tests/test_deprecated_design_aliases.py b/causalpy/tests/test_deprecated_design_aliases.py new file mode 100644 index 000000000..3424c051e --- /dev/null +++ b/causalpy/tests/test_deprecated_design_aliases.py @@ -0,0 +1,188 @@ +# Copyright 2022 - 2026 The PyMC Labs Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Backward-compatibility tests for deprecated design-matrix aliases. + +Each test verifies that: +1. Accessing the deprecated attribute triggers ``DeprecationWarning``. +2. The data returned by the deprecated path is identical to the new API. +""" + +import warnings + +import pandas as pd +import pytest +import xarray.testing as xrt +from sklearn.linear_model import LinearRegression + +import causalpy as cp + +# --------------------------------------------------------------------------- +# Helpers – lightweight experiment instances (OLS for speed) +# --------------------------------------------------------------------------- + + +def _make_did() -> cp.DifferenceInDifferences: + df = cp.load_data("did") + return cp.DifferenceInDifferences( + df, + formula="y ~ 1 + group * post_treatment", + time_variable_name="t", + group_variable_name="group", + model=LinearRegression(), + ) + + +def _make_rd() -> cp.RegressionDiscontinuity: + df = cp.load_data("rd") + return cp.RegressionDiscontinuity( + df, + formula="y ~ 1 + x + treated + x:treated", + treatment_threshold=0.5, + model=LinearRegression(), + ) + + +def _make_its(mock_pymc_sample) -> cp.InterruptedTimeSeries: + df = ( + cp.load_data("its") + .assign(date=lambda x: pd.to_datetime(x["date"])) + .set_index("date") + ) + return cp.InterruptedTimeSeries( + df, + treatment_time=pd.to_datetime("2017-01-01"), + formula="y ~ 1 + t", + model=cp.pymc_models.LinearRegression( + sample_kwargs={"random_seed": 42, "progressbar": False} + ), + ) + + +def _make_sc(mock_pymc_sample) -> cp.SyntheticControl: + df = cp.load_data("sc") + treatment_time = 70 + return cp.SyntheticControl( + df, + treatment_time, + control_units=["a", "b", "c", "d", "e", "f", "g"], + treated_units=["actual"], + model=cp.pymc_models.WeightedSumFitter( + sample_kwargs={ + "target_accept": 0.95, + "random_seed": 42, + "progressbar": False, + } + ), + ) + + +# --------------------------------------------------------------------------- +# Parametrised tests – formula-based (design["X"] / design["y"]) +# --------------------------------------------------------------------------- + +_FORMULA_CASES = [ + ("X", "design", "X"), + ("y", "design", "y"), +] + + +@pytest.mark.parametrize("old_attr,dataset_attr,key", _FORMULA_CASES) +def test_deprecated_alias_did(old_attr, dataset_attr, key): + result = _make_did() + with pytest.warns(DeprecationWarning, match=old_attr): + old_val = getattr(result, old_attr) + new_val = getattr(result, dataset_attr)[key] + xrt.assert_identical(old_val, new_val) + + +@pytest.mark.parametrize("old_attr,dataset_attr,key", _FORMULA_CASES) +def test_deprecated_alias_rd(old_attr, dataset_attr, key): + result = _make_rd() + with pytest.warns(DeprecationWarning, match=old_attr): + old_val = getattr(result, old_attr) + new_val = getattr(result, dataset_attr)[key] + xrt.assert_identical(old_val, new_val) + + +# --------------------------------------------------------------------------- +# Parametrised tests – pre/post split (ITS) +# --------------------------------------------------------------------------- + +_ITS_CASES = [ + ("pre_X", "pre_design", "X"), + ("pre_y", "pre_design", "y"), + ("post_X", "post_design", "X"), + ("post_y", "post_design", "y"), +] + + +@pytest.mark.parametrize("old_attr,dataset_attr,key", _ITS_CASES) +def test_deprecated_alias_its(mock_pymc_sample, old_attr, dataset_attr, key): + result = _make_its(mock_pymc_sample) + with pytest.warns(DeprecationWarning, match=old_attr): + old_val = getattr(result, old_attr) + new_val = getattr(result, dataset_attr)[key] + xrt.assert_identical(old_val, new_val) + + +# --------------------------------------------------------------------------- +# Parametrised tests – synthetic control +# --------------------------------------------------------------------------- + +_SC_CASES = [ + ("datapre_control", "pre_design", "control"), + ("datapre_treated", "pre_design", "treated"), + ("datapost_control", "post_design", "control"), + ("datapost_treated", "post_design", "treated"), +] + + +@pytest.mark.parametrize("old_attr,dataset_attr,key", _SC_CASES) +def test_deprecated_alias_sc(mock_pymc_sample, old_attr, dataset_attr, key): + result = _make_sc(mock_pymc_sample) + with pytest.warns(DeprecationWarning, match=old_attr): + old_val = getattr(result, old_attr) + new_val = getattr(result, dataset_attr)[key] + xrt.assert_identical(old_val, new_val) + + +# --------------------------------------------------------------------------- +# ConvexHullCheck should NOT trigger any DeprecationWarning +# --------------------------------------------------------------------------- + + +def test_convex_hull_check_no_deprecation_warning(mock_pymc_sample): + """ConvexHullCheck.run() must not trigger DeprecationWarning internally.""" + from causalpy.checks.convex_hull import ConvexHullCheck + from causalpy.pipeline import PipelineContext + + sc = _make_sc(mock_pymc_sample) + check = ConvexHullCheck() + ctx = PipelineContext(data=sc.data) + + with warnings.catch_warnings(): + warnings.simplefilter("error", DeprecationWarning) + check.run(sc, ctx) + + +# --------------------------------------------------------------------------- +# AttributeError for truly missing attributes +# --------------------------------------------------------------------------- + + +def test_attribute_error_for_missing(): + """Accessing a truly missing attribute still raises AttributeError.""" + result = _make_did() + with pytest.raises(AttributeError, match="no_such_attribute"): + _ = result.no_such_attribute diff --git a/causalpy/tests/test_integration_pymc_examples.py b/causalpy/tests/test_integration_pymc_examples.py index afe9231c3..012827b1c 100644 --- a/causalpy/tests/test_integration_pymc_examples.py +++ b/causalpy/tests/test_integration_pymc_examples.py @@ -1514,10 +1514,10 @@ def test_multi_unit_initialization(self, multi_unit_sc_data): assert sc.treatment_time == treatment_time # Check data shapes - assert sc.datapre_treated.shape == (40, len(treated_units)) - assert sc.datapost_treated.shape == (20, len(treated_units)) - assert sc.datapre_control.shape == (40, len(control_units)) - assert sc.datapost_control.shape == (20, len(control_units)) + assert sc.pre_design["treated"].shape == (40, len(treated_units)) + assert sc.post_design["treated"].shape == (20, len(treated_units)) + assert sc.pre_design["control"].shape == (40, len(control_units)) + assert sc.post_design["control"].shape == (20, len(control_units)) @pytest.mark.integration def test_multi_unit_scoring(self, multi_unit_sc_data): diff --git a/causalpy/tests/test_maketables_plugin.py b/causalpy/tests/test_maketables_plugin.py index 40072c265..d06df52ae 100644 --- a/causalpy/tests/test_maketables_plugin.py +++ b/causalpy/tests/test_maketables_plugin.py @@ -566,8 +566,13 @@ def test_returns_count_from_data_attr(self): stub = _Stub(data=np.zeros((42, 3))) assert _safe_observation_count(stub) == 42 - def test_returns_count_from_X_attr(self): - stub = _Stub(X=np.zeros((10, 2))) + def test_returns_count_from_design_dataset(self): + import xarray as xr + + design = xr.Dataset( + {"X": xr.DataArray(np.zeros((10, 2)), dims=["obs_ind", "coeffs"])} + ) + stub = _Stub(design=design) assert _safe_observation_count(stub) == 10 def test_returns_none_when_no_attrs(self): @@ -575,7 +580,7 @@ def test_returns_none_when_no_attrs(self): assert _safe_observation_count(stub) is None def test_skips_none_attrs(self): - stub = _Stub(data=None, X=None, datapre=np.zeros((5, 1))) + stub = _Stub(data=None, datapre=np.zeros((5, 1))) assert _safe_observation_count(stub) == 5 def test_skips_attr_without_shape(self): diff --git a/causalpy/tests/test_piecewise_its.py b/causalpy/tests/test_piecewise_its.py index 935be175c..c82439914 100644 --- a/causalpy/tests/test_piecewise_its.py +++ b/causalpy/tests/test_piecewise_its.py @@ -974,11 +974,12 @@ def test_piecewise_its_instance_attributes(): assert result.time_col == "t" assert result.outcome_variable_name == "y" - # Check X and y are xarray DataArrays - assert hasattr(result.X, "dims") - assert hasattr(result.y, "dims") - assert "obs_ind" in result.X.dims - assert "coeffs" in result.X.dims + # Check design Dataset contains X and y DataArrays + assert hasattr(result, "design") + assert "X" in result.design + assert "y" in result.design + assert "obs_ind" in result.design["X"].dims + assert "coeffs" in result.design["X"].dims # Check design info stored assert hasattr(result, "_x_design_info") @@ -1409,13 +1410,13 @@ def test_piecewise_its_x_y_shapes(): ) # X should be (n_obs, n_coeffs) - assert result.X.shape == (100, 4) + assert result.design["X"].shape == (100, 4) # y should be (n_obs, 1) for treated_units - assert result.y.shape == (100, 1) + assert result.design["y"].shape == (100, 1) # Check coordinates - assert list(result.X.coords["coeffs"].values) == result.labels + assert list(result.design["X"].coords["coeffs"].values) == result.labels def test_piecewise_its_y_pred_shape(): diff --git a/causalpy/tests/test_pymc_models.py b/causalpy/tests/test_pymc_models.py index 1f076cf5a..1b9b7de56 100644 --- a/causalpy/tests/test_pymc_models.py +++ b/causalpy/tests/test_pymc_models.py @@ -186,6 +186,90 @@ def test_fit_predict(self, coords, rng, mock_pymc_sample) -> None: assert isinstance(predictions, az.InferenceData) +class NonStandardDataModel(PyMCModel): + """Subclass using non-default data node names without overriding _data_setter.""" + + def build_model(self, X, y, coords): + with self: + if "treated_units" not in coords: + coords = coords.copy() if coords else {} + coords["treated_units"] = ["unit_0"] + self.add_coords(coords) + X_ = pm.Data(name="X_design", value=X, dims=["obs_ind", "coeffs"]) + y_ = pm.Data(name="y_obs", value=y, dims=["obs_ind", "treated_units"]) + beta = pm.Normal("beta", mu=0, sigma=1, dims=["treated_units", "coeffs"]) + sigma = pm.HalfNormal("y_hat_sigma", sigma=1, dims="treated_units") + mu = pm.Deterministic( + "mu", pm.math.dot(X_, beta.T), dims=["obs_ind", "treated_units"] + ) + pm.Normal( + "y_hat", + mu=mu, + sigma=sigma, + observed=y_, + dims=["obs_ind", "treated_units"], + ) + + +class TestDataSetterValidation: + """Tests for _data_setter validation of expected data nodes.""" + + @pytest.fixture() + def xy_and_coords(self, rng): + X = xr.DataArray( + rng.normal(size=(20, 2)), + dims=["obs_ind", "coeffs"], + coords={"obs_ind": np.arange(20), "coeffs": ["x1", "x2"]}, + ) + y = xr.DataArray( + rng.normal(size=(20, 1)), + dims=["obs_ind", "treated_units"], + coords={"obs_ind": np.arange(20), "treated_units": ["unit_0"]}, + ) + coords = { + "obs_ind": np.arange(20), + "coeffs": ["x1", "x2"], + "treated_units": ["unit_0"], + } + return X, y, coords + + def test_mismatched_X_raises(self, xy_and_coords, mock_pymc_sample): + """Subclass with non-standard data node names gets a clear ValueError.""" + X, y, coords = xy_and_coords + model = NonStandardDataModel(sample_kwargs={"chains": 2, "draws": 2}) + model.fit(X, y, coords=coords) + with pytest.raises(ValueError, match="Data node 'X' not found"): + model.predict(X=X) + + def test_missing_y_raises(self, xy_and_coords, mock_pymc_sample): + """When only y is renamed, error message references 'y'.""" + + class OnlyYRenamed(PyMCModel): + def build_model(self, X, y, coords): + with self: + coords = coords.copy() if coords else {} + coords.setdefault("treated_units", ["unit_0"]) + self.add_coords(coords) + X_ = pm.Data("X", X, dims=["obs_ind", "coeffs"]) + y_ = pm.Data("outcome", y, dims=["obs_ind", "treated_units"]) + beta = pm.Normal( + "beta", mu=0, sigma=1, dims=["treated_units", "coeffs"] + ) + sigma = pm.HalfNormal("y_hat_sigma", sigma=1, dims="treated_units") + mu = pm.Deterministic( + "mu", + pm.math.dot(X_, beta.T), + dims=["obs_ind", "treated_units"], + ) + pm.Normal("y_hat", mu=mu, sigma=sigma, observed=y_) + + X, y, coords = xy_and_coords + model = OnlyYRenamed(sample_kwargs={"chains": 2, "draws": 2}) + model.fit(X, y, coords=coords) + with pytest.raises(ValueError, match="Data node 'y' not found"): + model.predict(X=X) + + def test_idata_property(mock_pymc_sample, did_data): """Test that we can access the idata property of the model""" result = cp.DifferenceInDifferences(