diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py
index f271e073cfd3..dc1a0869b30c 100644
--- a/python-package/xgboost/__init__.py
+++ b/python-package/xgboost/__init__.py
@@ -3,8 +3,11 @@
 Contributors: https://github.com/dmlc/xgboost/blob/master/CONTRIBUTORS.md
 """
 
-from . import tracker  # noqa
-from . import collective
+from . import (
+    collective,
+    interpret,
+    tracker,  # noqa
+)
 from ._c_api import _py_version
 from .core import (
     Booster,
@@ -62,4 +65,6 @@
     "XGBRFRegressor",
     # collective
     "collective",
+    # interpretability
+    "interpret",
 ]
diff --git a/python-package/xgboost/interpret.py b/python-package/xgboost/interpret.py
new file mode 100644
index 000000000000..22790743a73d
--- /dev/null
+++ b/python-package/xgboost/interpret.py
@@ -0,0 +1,123 @@
+"""Interpretability functions for XGBoost models."""
+
+from typing import Optional, Tuple, Union
+
+import numpy as np
+
+from ._typing import ArrayLike, FloatCompatible, IterationRange
+from .core import Booster, DMatrix
+
+
+def _as_booster(model: object) -> Booster:
+    if isinstance(model, Booster):
+        return model
+    get_booster = getattr(model, "get_booster", None)
+    if not callable(get_booster):
+        raise TypeError(
+            "`model` must be an xgboost.Booster or an object with get_booster()."
+        )
+    booster = get_booster()
+    if not isinstance(booster, Booster):
+        raise TypeError("`model.get_booster()` must return an xgboost.Booster.")
+    return booster
+
+
+def _get_iteration_range(
+    model: object, iteration_range: Optional[IterationRange]
+) -> IterationRange:
+    get_iteration_range = getattr(model, "_get_iteration_range", None)
+    if get_iteration_range is not None:
+        return get_iteration_range(iteration_range)
+    if iteration_range is None:
+        return (0, 0)
+    return iteration_range
+
+
+def _as_prediction_dmatrix(
+    model: object, X: Union[DMatrix, ArrayLike], missing: Optional[FloatCompatible]
+) -> DMatrix:
+    if isinstance(X, DMatrix):
+        if missing is not None:
+            raise ValueError("`missing` must not be specified when `X` is a DMatrix.")
+        return X
+
+    return DMatrix(
+        X,
+        missing=missing if missing is not None else getattr(model, "missing", None),
+        nthread=getattr(model, "n_jobs", None),
+        feature_types=getattr(model, "feature_types", None),
+        enable_categorical=getattr(model, "enable_categorical", False),
+    )
+
+
+def shap_values(  # pylint: disable=too-many-arguments
+    model: object,
+    X: Union[DMatrix, ArrayLike],
+    *,
+    X_background: Optional[Union[DMatrix, ArrayLike]] = None,
+    output_margin: bool = False,
+    iteration_range: Optional[IterationRange] = None,
+    missing: Optional[FloatCompatible] = None,
+    validate_features: bool = True,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Return SHAP values for an XGBoost model.
+
+    This function accepts either a :py:class:`xgboost.Booster` or an sklearn-style
+    XGBoost model and returns feature contributions together with the separated
+    bias term.
+
+    Parameters
+    ----------
+    model :
+        XGBoost booster or sklearn-style XGBoost model.
+    X :
+        Input data.
+    X_background :
+        Background data for interventional SHAP values. This is reserved for a
+        future implementation and is currently unsupported.
+    output_margin :
+        Accepted for API compatibility. SHAP contributions currently correspond
+        to the model margin.
+    iteration_range :
+        Specifies which layer of trees are used in prediction.
+    missing :
+        Value in array-like ``X`` to treat as missing. When None, use the
+        model's missing value if available, otherwise ``np.nan``. This must not
+        be specified when ``X`` is already a DMatrix.
+    validate_features :
+        Validate feature names between the model and input data.
+
+    Returns
+    -------
+    values, bias :
+        ``values`` contains feature SHAP values with the bias term removed.
+        ``bias`` contains the separated bias term. For multi-target models, the
+        output shape follows the corresponding prediction shape with the final
+        feature dimension split into ``values`` and ``bias``.
+
+    Notes
+    -----
+    To use GPU algorithms, configure the model before calling this function, for
+    example with ``booster.set_param({"device": "cuda"})``.
+    """
+    if X_background is not None:
+        raise NotImplementedError("`X_background` is not yet supported.")
+    # SHAP contributions currently correspond to the model margin. Keep this
+    # argument in the initial API so callers can use the proposed signature.
+    _ = output_margin
+
+    booster = _as_booster(model)
+    data = _as_prediction_dmatrix(model, X, missing)
+    contribs = booster.predict(
+        data,
+        pred_contribs=True,
+        validate_features=validate_features,
+        iteration_range=_get_iteration_range(model, iteration_range),
+    )
+
+    values = contribs[..., :-1]
+    bias = contribs[..., -1]
+    return values, bias
+
+
+__all__ = ["shap_values"]
diff --git a/tests/python/test_interpret.py b/tests/python/test_interpret.py
new file mode 100644
index 000000000000..4b70382325e7
--- /dev/null
+++ b/tests/python/test_interpret.py
@@ -0,0 +1,88 @@
+import numpy as np
+import pytest
+import xgboost as xgb
+from xgboost import interpret
+
+
+def test_shap_values_matches_predict() -> None:
+    rng = np.random.RandomState(1994)
+    X = rng.randn(16, 4)
+    y = rng.randn(16)
+    booster = xgb.train({"tree_method": "hist"}, xgb.DMatrix(X, label=y), 4)
+
+    values, bias = interpret.shap_values(booster, X)
+    contribs = booster.predict(xgb.DMatrix(X), pred_contribs=True)
+
+    np.testing.assert_allclose(values, contribs[:, :-1])
+    np.testing.assert_allclose(bias, contribs[:, -1])
+
+
+def test_shap_values_accepts_sklearn_model() -> None:
+    rng = np.random.RandomState(1995)
+    X = rng.randn(16, 4)
+    y = rng.randn(16)
+    reg = xgb.XGBRegressor(n_estimators=4, tree_method="hist")
+    reg.fit(X, y)
+
+    values, bias = interpret.shap_values(reg, X)
+    contribs = reg.get_booster().predict(xgb.DMatrix(X), pred_contribs=True)
+
+    np.testing.assert_allclose(values, contribs[:, :-1])
+    np.testing.assert_allclose(bias, contribs[:, -1])
+
+
+def test_shap_values_uses_sklearn_iteration_range() -> None:
+    rng = np.random.RandomState(1996)
+    X = rng.randn(64, 4)
+    y = rng.randn(64)
+    reg = xgb.XGBRegressor(n_estimators=8, tree_method="hist")
+    reg.fit(X, y)
+    reg.get_booster().set_attr(best_iteration="3")
+
+    values, bias = interpret.shap_values(reg, X, iteration_range=(0, 0))
+    contribs = reg.get_booster().predict(
+        xgb.DMatrix(X), pred_contribs=True, iteration_range=(0, 4)
+    )
+
+    np.testing.assert_allclose(values, contribs[:, :-1])
+    np.testing.assert_allclose(bias, contribs[:, -1])
+
+
+def test_shap_values_rejects_background_data() -> None:
+    rng = np.random.RandomState(1997)
+    X = rng.randn(16, 4)
+    y = rng.randn(16)
+    booster = xgb.train({"tree_method": "hist"}, xgb.DMatrix(X, label=y), 4)
+
+    with pytest.raises(NotImplementedError, match="X_background"):
+        interpret.shap_values(booster, X, X_background=X)
+
+
+def test_shap_values_validates_get_booster() -> None:
+    class InvalidModel:
+        get_booster = "booster"
+
+    with pytest.raises(TypeError, match="get_booster"):
+        interpret.shap_values(InvalidModel(), np.empty((1, 1)))
+
+
+def test_shap_values_uses_missing_for_array_like_data() -> None:
+    X = np.array([[0.0, 1.0], [2.0, 0.0], [3.0, 4.0]])
+    y = np.array([0.0, 1.0, 1.0])
+    booster = xgb.train(
+        {"tree_method": "hist"}, xgb.DMatrix(X, label=y, missing=0.0), 4
+    )
+
+    values, bias = interpret.shap_values(booster, X, missing=0.0)
+    contribs = booster.predict(xgb.DMatrix(X, missing=0.0), pred_contribs=True)
+
+    np.testing.assert_allclose(values, contribs[:, :-1])
+    np.testing.assert_allclose(bias, contribs[:, -1])
+
+
+def test_shap_values_rejects_missing_with_dmatrix() -> None:
+    X = xgb.DMatrix(np.array([[0.0, 1.0]]), label=np.array([0.0]), missing=0.0)
+    booster = xgb.train({"tree_method": "hist"}, X, 1)
+
+    with pytest.raises(ValueError, match="DMatrix"):
+        interpret.shap_values(booster, X, missing=0.0)