Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions causalpy/checks/convex_hull.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ def run(
) -> CheckResult:
"""Run the convex hull violation check on pre-treatment data."""
sc = experiment
datapre_control = sc.datapre_control # type: ignore[attr-defined]
datapre_treated = sc.datapre_treated # type: ignore[attr-defined]
datapre_control = sc.pre_design["control"] # type: ignore[attr-defined]
datapre_treated = sc.pre_design["treated"] # type: ignore[attr-defined]
Comment on lines +53 to +54
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tiny nit: since applicable_methods = {SyntheticControl} and validate() already enforces the type, you could drop both # type: ignore[attr-defined] markers by adding assert isinstance(experiment, SyntheticControl) (or just calling self.validate(experiment)) at the top of run() — that narrows sc to SyntheticControl and pre_design becomes a known attribute for mypy. Same effect, no escape hatches.


all_results = []
total_violations = 0
Expand Down
64 changes: 64 additions & 0 deletions causalpy/experiments/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,16 @@
from __future__ import annotations

import contextlib
import warnings
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Literal

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
from sklearn.base import RegressorMixin

from causalpy.maketables_adapters import get_maketables_adapter
Expand Down Expand Up @@ -114,6 +117,67 @@ class BaseExperiment(ABC):

_default_model_class: type[PyMCModel] | None = None

_deprecated_design_aliases: dict[str, tuple[str, str]] = {}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Quick one — InversePropensityWeighting, InstrumentalVariable, and StaggeredDiD didn't get migrated to design (still using numpy self.X / self.y, see e.g. causalpy/experiments/inverse_propensity_weighting.py:111, instrumental_variable.py:155, staggered_did.py:332-338) and accordingly don't show up in _deprecated_design_aliases here. Assuming intentional given their non-standard layouts (IPW has t + outcome, IV has Z + endogenous treatment, staggered has the train/full split)? If so, maybe worth a sentence in the PR body so it's not mistaken for an oversight.

"""Mapping of ``old_attr -> (dataset_attr, key)`` for deprecated design
matrix accessors. Subclasses populate this so that
``__getattr__`` can forward accesses with a deprecation warning."""

def __getattr__(self, name: str) -> Any:
aliases = type(self)._deprecated_design_aliases
if name in aliases:
dataset_attr, key = aliases[name]
warnings.warn(
f"{name} is deprecated, use {dataset_attr}['{key}']",
DeprecationWarning,
stacklevel=2,
)
return getattr(self, dataset_attr)[key]
raise AttributeError(
f"'{type(self).__name__}' object has no attribute '{name}'"
)

@staticmethod
def _build_design_dataset(
X_raw: np.ndarray,
y_raw: np.ndarray,
*,
obs_ind: np.ndarray | pd.Index,
coeffs: list[str],
treated_units: list[str] | None = None,
) -> xr.Dataset:
"""Build a standard ``xr.Dataset`` from raw design matrices.

Parameters
----------
X_raw : np.ndarray
Predictor matrix, shape ``(n_obs, n_coeffs)``.
y_raw : np.ndarray
Outcome matrix, shape ``(n_obs, n_units)``.
obs_ind : array-like
Observation index coordinates.
coeffs : list[str]
Coefficient / column names for ``X_raw``.
treated_units : list[str], optional
Names for the treated-unit dimension of ``y_raw``.
Defaults to ``["unit_0"]``.
"""
if treated_units is None:
treated_units = ["unit_0"]
return xr.Dataset(
{
"X": xr.DataArray(
X_raw,
dims=["obs_ind", "coeffs"],
coords={"obs_ind": obs_ind, "coeffs": coeffs},
),
"y": xr.DataArray(
y_raw,
dims=["obs_ind", "treated_units"],
coords={"obs_ind": obs_ind, "treated_units": treated_units},
),
}
)

def __init__(self, model: PyMCModel | RegressorMixin | None = None) -> None:
# Ensure we've made any provided Scikit Learn model (as identified as being type
# RegressorMixin) compatible with CausalPy by appending our custom methods.
Expand Down
37 changes: 16 additions & 21 deletions causalpy/experiments/diff_in_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ class DifferenceInDifferences(BaseExperiment):
supports_ols = True
supports_bayes = True
_default_model_class = LinearRegression
_deprecated_design_aliases = {"X": ("design", "X"), "y": ("design", "y")}

def __init__(
self,
Expand Down Expand Up @@ -130,42 +131,36 @@ def _build_design_matrices(self) -> None:
self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.y, self.X = np.asarray(y), np.asarray(X)
self._y_raw, self._X_raw = np.asarray(y), np.asarray(X)
self.outcome_variable_name = y.design_info.column_names[0]

def _prepare_data(self) -> None:
"""Convert design matrices to xarray DataArrays."""
self.X = xr.DataArray(
self.X,
dims=["obs_ind", "coeffs"],
coords={
"obs_ind": np.arange(self.X.shape[0]),
"coeffs": self.labels,
},
)
self.y = xr.DataArray(
self.y,
dims=["obs_ind", "treated_units"],
coords={"obs_ind": np.arange(self.y.shape[0]), "treated_units": ["unit_0"]},
"""Bundle design matrices into an ``xr.Dataset``."""
n = self._X_raw.shape[0]
self.design = self._build_design_dataset(
self._X_raw,
self._y_raw,
obs_ind=np.arange(n),
coeffs=self.labels,
)
del self._X_raw, self._y_raw

def algorithm(self) -> None:
"""Run the experiment algorithm: fit model, predict, and calculate causal impact."""
# fit model
X = self.design["X"]
y = self.design["y"]

if isinstance(self.model, PyMCModel):
COORDS = {
"coeffs": self.labels,
"obs_ind": np.arange(self.X.shape[0]),
"obs_ind": np.arange(X.shape[0]),
"treated_units": ["unit_0"],
}
self.model.fit(X=self.X, y=self.y, coords=COORDS)
self.model.fit(X=X, y=y, coords=COORDS)
elif isinstance(self.model, RegressorMixin):
# Ensure the intercept is part of the coefficients array rather than
# a separate intercept_ attribute. See #664 / PR #693 for
# centralising this in BaseExperiment.
if hasattr(self.model, "fit_intercept"):
self.model.fit_intercept = False
self.model.fit(X=self.X, y=self.y)
self.model.fit(X=X, y=y)
else:
raise ValueError("Model type not recognized")

Expand Down
115 changes: 47 additions & 68 deletions causalpy/experiments/interrupted_time_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ class InterruptedTimeSeries(BaseExperiment):
supports_ols = True
supports_bayes = True
_default_model_class = LinearRegression
_deprecated_design_aliases = {
"pre_X": ("pre_design", "X"),
"pre_y": ("pre_design", "y"),
"post_X": ("post_design", "X"),
"post_y": ("post_design", "y"),
}

def __init__(
self,
Expand All @@ -139,9 +145,8 @@ def __init__(
**kwargs: Any,
) -> None:
super().__init__(model=model)
self.pre_y: xr.DataArray
self.post_y: xr.DataArray
# rename the index to "obs_ind"
self.pre_design: xr.Dataset
self.post_design: xr.Dataset
data.index.name = "obs_ind"
self.data = data
self.input_validation(data, treatment_time, treatment_end_time)
Expand All @@ -155,96 +160,76 @@ def __init__(

def _build_design_matrices(self) -> None:
"""Build design matrices for pre and post intervention periods using patsy."""
# set things up with pre-intervention data
y, X = dmatrices(self.formula, self.datapre)
self.outcome_variable_name = y.design_info.column_names[0]
self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.pre_y, self.pre_X = np.asarray(y), np.asarray(X)
# process post-intervention data
self._pre_y_raw, self._pre_X_raw = np.asarray(y), np.asarray(X)
(new_y, new_x) = build_design_matrices(
[self._y_design_info, self._x_design_info], self.datapost
)
self.post_X = np.asarray(new_x)
self.post_y = np.asarray(new_y)
self._post_X_raw = np.asarray(new_x)
self._post_y_raw = np.asarray(new_y)

def _prepare_data(self) -> None:
"""Convert design matrices to xarray DataArrays for pre and post periods."""
self.pre_X = xr.DataArray(
self.pre_X,
dims=["obs_ind", "coeffs"],
coords={
"obs_ind": self.datapre.index,
"coeffs": self.labels,
},
)
self.pre_y = xr.DataArray(
self.pre_y, # Keep 2D shape
dims=["obs_ind", "treated_units"],
coords={"obs_ind": self.datapre.index, "treated_units": ["unit_0"]},
)
self.post_X = xr.DataArray(
self.post_X,
dims=["obs_ind", "coeffs"],
coords={
"obs_ind": self.datapost.index,
"coeffs": self.labels,
},
"""Bundle design matrices into ``xr.Dataset`` objects for pre and post periods."""
self.pre_design = self._build_design_dataset(
self._pre_X_raw,
self._pre_y_raw,
obs_ind=self.datapre.index,
coeffs=self.labels,
)
self.post_y = xr.DataArray(
self.post_y, # Keep 2D shape
dims=["obs_ind", "treated_units"],
coords={"obs_ind": self.datapost.index, "treated_units": ["unit_0"]},
self.post_design = self._build_design_dataset(
self._post_X_raw,
self._post_y_raw,
obs_ind=self.datapost.index,
coeffs=self.labels,
)
del self._pre_X_raw, self._pre_y_raw, self._post_X_raw, self._post_y_raw

def algorithm(self) -> None:
"""Run the experiment algorithm: fit model, predict, and calculate causal impact."""
# fit the model to the observed (pre-intervention) data
# All PyMC models now accept xr.DataArray with consistent API
pre_X = self.pre_design["X"]
pre_y = self.pre_design["y"]
post_X = self.post_design["X"]
post_y = self.post_design["y"]

if isinstance(self.model, PyMCModel):
COORDS: dict[str, Any] = {
"coeffs": self.labels,
"obs_ind": np.arange(self.pre_X.shape[0]),
"obs_ind": np.arange(pre_X.shape[0]),
"treated_units": ["unit_0"],
"datetime_index": self.datapre.index, # For time series models
"datetime_index": self.datapre.index,
}
self.model.fit(X=self.pre_X, y=self.pre_y, coords=COORDS)
self.model.fit(X=pre_X, y=pre_y, coords=COORDS)
elif isinstance(self.model, RegressorMixin):
# For OLS models, use 1D y data
self.model.fit(X=self.pre_X, y=self.pre_y.isel(treated_units=0))
self.model.fit(X=pre_X, y=pre_y.isel(treated_units=0))
else:
raise ValueError("Model type not recognized")

# score the goodness of fit to the pre-intervention data
if isinstance(self.model, PyMCModel):
self.score = self.model.score(X=self.pre_X, y=self.pre_y)
self.score = self.model.score(X=pre_X, y=pre_y)
elif isinstance(self.model, RegressorMixin):
self.score = self.model.score(
X=self.pre_X, y=self.pre_y.isel(treated_units=0)
)
self.score = self.model.score(X=pre_X, y=pre_y.isel(treated_units=0))

# get the model predictions of the observed (pre-intervention) data
if isinstance(self.model, PyMCModel | RegressorMixin):
self.pre_pred = self.model.predict(X=self.pre_X)
self.pre_pred = self.model.predict(X=pre_X)

# calculate the counterfactual (post period)
if isinstance(self.model, PyMCModel):
self.post_pred = self.model.predict(X=self.post_X, out_of_sample=True)
self.post_pred = self.model.predict(X=post_X, out_of_sample=True)
elif isinstance(self.model, RegressorMixin):
self.post_pred = self.model.predict(X=self.post_X)
self.post_pred = self.model.predict(X=post_X)

# calculate impact - all PyMC models now use 2D data with treated_units
if isinstance(self.model, PyMCModel):
self.pre_impact = self.model.calculate_impact(self.pre_y, self.pre_pred)
self.post_impact = self.model.calculate_impact(self.post_y, self.post_pred)
self.pre_impact = self.model.calculate_impact(pre_y, self.pre_pred)
self.post_impact = self.model.calculate_impact(post_y, self.post_pred)
elif isinstance(self.model, RegressorMixin):
# SKL models work with 1D data
self.pre_impact = self.model.calculate_impact(
self.pre_y.isel(treated_units=0), self.pre_pred
pre_y.isel(treated_units=0), self.pre_pred
)
self.post_impact = self.model.calculate_impact(
self.post_y.isel(treated_units=0), self.post_pred
post_y.isel(treated_units=0), self.post_pred
)

self.post_impact_cumulative = self.model.calculate_cumulative_impact(
Expand Down Expand Up @@ -625,9 +610,7 @@ def _bayesian_plot(

(h,) = ax[0].plot(
self.datapre.index,
self.pre_y.isel(treated_units=0)
if hasattr(self.pre_y, "isel")
else self.pre_y[:, 0],
self.pre_design["y"].isel(treated_units=0),
"k.",
label="Observations",
)
Expand All @@ -652,9 +635,7 @@ def _bayesian_plot(

ax[0].plot(
self.datapost.index,
self.post_y.isel(treated_units=0)
if hasattr(self.post_y, "isel")
else self.post_y[:, 0],
self.post_design["y"].isel(treated_units=0),
"k.",
)
# Shaded causal effect
Expand All @@ -667,9 +648,7 @@ def _bayesian_plot(
h = ax[0].fill_between(
self.datapost.index,
y1=post_pred_mu,
y2=self.post_y.isel(treated_units=0)
if hasattr(self.post_y, "isel")
else self.post_y[:, 0],
y2=self.post_design["y"].isel(treated_units=0),
color="C0",
alpha=0.25,
)
Expand Down Expand Up @@ -805,10 +784,10 @@ def _ols_plot(

fig, ax = plt.subplots(3, 1, sharex=True, figsize=(7, 8))

ax[0].plot(self.datapre.index, self.pre_y, "k.")
ax[0].plot(self.datapre.index, self.pre_design["y"], "k.")
ax[0].plot(self.datapre.index, self.pre_pred, c="k", label="model fit")

ax[0].plot(self.datapost.index, self.post_y, "k.")
ax[0].plot(self.datapost.index, self.post_design["y"], "k.")
ax[0].plot(
self.datapost.index,
self.post_pred,
Expand All @@ -820,7 +799,7 @@ def _ols_plot(
ax[0].fill_between(
self.datapost.index,
y1=np.squeeze(self.post_pred),
y2=np.squeeze(self.post_y),
y2=np.squeeze(self.post_design["y"]),
color="C0",
alpha=0.25,
label="Causal impact",
Expand Down
Loading
Loading