From 14b51a3acc2918e90b0a5247310800b565e96c9b Mon Sep 17 00:00:00 2001
From: Mossab Arektout
Date: Sat, 9 May 2026 20:40:54 +0100
Subject: [PATCH] feat: add AutoML.get_feature_importance

---
Review note: this patch was amended by hand (docstrings, logging of failed
importance computation, results-path guard, stricter tests). Hunk line counts
and the diffstat were updated; the post-image blob hashes on the "index"
lines were not recomputed.

 supervised/automl.py                          |  24 ++++
 supervised/base_automl.py                     | 126 ++++++++++++++++++
 .../test_feature_importance_api.py            |  80 +++++++++++
 3 files changed, 230 insertions(+)
 create mode 100644 tests/tests_automl/test_feature_importance_api.py

diff --git a/supervised/automl.py b/supervised/automl.py
index 527e5b1d..2009bb48 100644
--- a/supervised/automl.py
+++ b/supervised/automl.py
@@ -538,6 +538,30 @@ def report(self, width=900, height=1200):
     def report_structured(self, format="markdown", model_name=None):
         return self._report_structured(format, model_name)
 
+    def get_feature_importance(
+        self,
+        model: Union[str, Literal["best"], Literal["all"]] = "best",
+        kind: Literal["raw", "normalized"] = "raw",
+    ):
+        """
+        Get feature importance for AutoML models.
+
+        Arguments:
+            model (str): Can be "best", "all", or a model name.
+            kind (str): Can be "raw" or "normalized". With "normalized",
+                importance values are min-max scaled to the [0, 1] range.
+
+        Returns:
+            pandas.DataFrame or dict:
+                - DataFrame with columns: ["feature", "importance"] for "best" or model name.
+                - Dict {model_name: DataFrame} for "all".
+
+        Raises:
+            AutoMLException: If `kind` is invalid, AutoML is not fitted,
+                or the model name is not found.
+        """
+        return self._get_feature_importance(model=model, kind=kind)
+
     def need_retrain(
         self,
         X: Union[numpy.ndarray, pandas.DataFrame],
diff --git a/supervised/base_automl.py b/supervised/base_automl.py
index e182d450..784abc83 100644
--- a/supervised/base_automl.py
+++ b/supervised/base_automl.py
@@ -46,7 +46,9 @@
 from supervised.utils.jsonencoder import MLJSONEncoder
 from supervised.utils.leaderboard_plots import LeaderboardPlots
 from supervised.utils.metric import Metric, UserDefinedEvalMetric
+from supervised.utils.importance import PermutationImportance
 from supervised.utils.report_structured import (
+    _load_model_importance_vector,
     build_compact_view,
     build_structured_report,
     save_structured_report,
@@ -288,6 +290,130 @@ def get_leaderboard(
 
         return ldb
 
+    def _normalize_importance(self, importance):
+        """Min-max scale importance values to the [0, 1] range.
+
+        Returns a Series of zeros when all values are equal.
+        """
+        min_value = float(importance.min())
+        max_value = float(importance.max())
+        if max_value == min_value:
+            return pd.Series(np.zeros(len(importance)), index=importance.index)
+        return (importance - min_value) / (max_value - min_value)
+
+    def _aggregate_feature_importance(self, model_name):
+        """Load the stored importance of `model_name`, sorted descending.
+
+        Returns None when there is no results path or nothing could be loaded.
+        """
+        if self._results_path is None:
+            return None
+        model_dir = os.path.join(self._results_path, model_name)
+        importance = _load_model_importance_vector(model_dir)
+        if importance is None:
+            return None
+        return importance.sort_values(ascending=False)
+
+    def _compute_feature_importance_for_model(self, model):
+        """Compute permutation importance for every learner of `model`.
+
+        Best-effort: returns True on success and False when `model` is not a
+        `ModelFramework`, there is no results path, or the computation fails.
+        """
+        if not isinstance(model, ModelFramework) or self._results_path is None:
+            return False
+
+        X_path = self._X_path or os.path.join(self._results_path, "X.data")
+        y_path = self._y_path or os.path.join(self._results_path, "y.data")
+
+        try:
+            X = load_data(X_path)
+            y_raw = load_data(y_path)
+            y = y_raw["target"] if isinstance(y_raw, pd.DataFrame) else y_raw
+            model_path = os.path.join(self._results_path, model.get_name())
+            for learner, preprocessing in zip(model.learners, model.preprocessings):
+                learner.reload()
+                X_data, y_data, _ = preprocessing.transform(X.copy(), y.copy(), None)
+                PermutationImportance.compute_and_plot(
+                    learner,
+                    X_data,
+                    y_data,
+                    model_path,
+                    learner.name,
+                    metric_name=model.get_metric_name(),
+                    ml_task=self._ml_task,
+                    n_jobs=self._n_jobs,
+                )
+        except Exception:
+            # Keep the best-effort contract, but leave a trace of the failure
+            # instead of discarding it silently.
+            import logging
+
+            logging.getLogger(__name__).warning(
+                "Cannot compute feature importance for model %s",
+                model.get_name(),
+                exc_info=True,
+            )
+            return False
+        return True
+
+    def _model_feature_importance(self, model, kind):
+        """Return a ["feature", "importance"] DataFrame for a single model.
+
+        Stored importance is used when available, otherwise it is computed
+        first. The frame is empty when no importance can be obtained.
+        """
+        model_name = model.get_name()
+        importance = self._aggregate_feature_importance(model_name)
+        if importance is None:
+            self._compute_feature_importance_for_model(model)
+            importance = self._aggregate_feature_importance(model_name)
+
+        if importance is None:
+            return pd.DataFrame(columns=["feature", "importance"])
+
+        if kind == "normalized":
+            importance = self._normalize_importance(importance)
+
+        return (
+            pd.DataFrame({"feature": importance.index, "importance": importance.values})
+            .sort_values("importance", ascending=False)
+            .reset_index(drop=True)
+        )
+
+    def _get_feature_importance(self, model="best", kind="raw"):
+        """Validate the arguments and return the requested feature importance.
+
+        Raises AutoMLException for an invalid `kind`, a not fitted AutoML, or an
+        unknown model name.
+        """
+        if kind not in ["raw", "normalized"]:
+            raise AutoMLException(
+                "Invalid kind parameter. Allowed values are: 'raw', 'normalized'."
+            )
+
+        if self._best_model is None and self.results_path is not None:
+            self._check_can_load()
+        if self._best_model is None:
+            raise AutoMLException(
+                "This model has not been fitted yet. Please call `fit()` first."
+            )
+
+        if model == "best":
+            return self._model_feature_importance(self._best_model, kind)
+
+        if model == "all":
+            return {
+                m.get_name(): self._model_feature_importance(m, kind) for m in self._models
+            }
+
+        selected = next((m for m in self._models if m.get_name() == model), None)
+        if selected is None:
+            raise AutoMLException(
+                f"Model `{model}` not found. Use 'best', 'all' or a valid model name."
+            )
+        return self._model_feature_importance(selected, kind)
+
     def keep_model(self, model, model_subpath):
         if model is None:
             return
diff --git a/tests/tests_automl/test_feature_importance_api.py b/tests/tests_automl/test_feature_importance_api.py
new file mode 100644
index 00000000..ed50e8a8
--- /dev/null
+++ b/tests/tests_automl/test_feature_importance_api.py
@@ -0,0 +1,80 @@
+import os
+import shutil
+import unittest
+
+import pandas as pd
+import pytest
+from sklearn import datasets
+
+from supervised import AutoML
+
+
+iris = datasets.load_iris()
+
+
+@pytest.mark.usefixtures("data_folder")
+class AutoMLFeatureImportanceApiTest(unittest.TestCase):
+    automl_dir = "AutoMLFeatureImportanceApiTest"
+
+    def tearDown(self):
+        shutil.rmtree(self.automl_dir, ignore_errors=True)
+
+    def setUp(self):
+        shutil.rmtree(self.automl_dir, ignore_errors=True)
+
+    def _train(self):
+        automl = AutoML(
+            results_path=self.automl_dir,
+            mode="Explain",
+            explain_level=1,
+            algorithms=["Decision Tree"],
+            train_ensemble=False,
+            random_state=1,
+            verbose=0,
+        )
+        automl.fit(iris.data, iris.target)
+        return automl
+
+    def test_get_feature_importance_best(self):
+        automl = self._train()
+        fi = automl.get_feature_importance()
+        self.assertIsInstance(fi, pd.DataFrame)
+        self.assertIn("feature", fi.columns)
+        self.assertIn("importance", fi.columns)
+        self.assertGreater(fi.shape[0], 0)
+
+    def test_get_feature_importance_all_and_model_name(self):
+        automl = self._train()
+        fi_all = automl.get_feature_importance(model="all")
+        self.assertIsInstance(fi_all, dict)
+        self.assertIn(automl._best_model.get_name(), fi_all)
+
+        fi_model = automl.get_feature_importance(model=automl._best_model.get_name())
+        self.assertIsInstance(fi_model, pd.DataFrame)
+        self.assertGreater(fi_model.shape[0], 0)
+
+    def test_get_feature_importance_normalized(self):
+        automl = self._train()
+        fi = automl.get_feature_importance(kind="normalized")
+        self.assertGreater(fi.shape[0], 0)
+        self.assertGreaterEqual(fi["importance"].min(), 0.0)
+        self.assertLessEqual(fi["importance"].max(), 1.0)
+
+    def test_recompute_feature_importance_if_missing(self):
+        automl = self._train()
+        model_name = automl._best_model.get_name()
+        model_path = os.path.join(self.automl_dir, model_name)
+
+        removed = []
+        for f in os.listdir(model_path):
+            if "_importance.csv" in f and "shap" not in f:
+                os.remove(os.path.join(model_path, f))
+                removed.append(f)
+        # Guard against a vacuous test: some importance file must have been removed.
+        self.assertGreater(len(removed), 0)
+
+        fi = automl.get_feature_importance(model=model_name)
+        self.assertIsInstance(fi, pd.DataFrame)
+        self.assertIn("feature", fi.columns)
+        self.assertIn("importance", fi.columns)
+        self.assertGreater(fi.shape[0], 0)