Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions supervised/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,25 @@ def report(self, width=900, height=1200):
def report_structured(self, format="markdown", model_name=None):
    """
    Public entry point that delegates to `self._report_structured`.

    Arguments:
        format: Forwarded unchanged as the first positional argument.
            Defaults to "markdown".
        model_name: Forwarded unchanged as the second positional argument.
            Defaults to None.

    Returns:
        Whatever `self._report_structured(format, model_name)` returns.
    """
    # Positional forwarding is kept on purpose: the parameter names of the
    # private implementation are not visible from here.
    structured = self._report_structured(format, model_name)
    return structured

def get_feature_importance(
    self,
    model: Union[str, Literal["best"], Literal["all"]] = "best",
    kind: Literal["raw", "normalized"] = "raw",
):
    """
    Return feature importance for one or all trained AutoML models.

    This is a thin public wrapper; the work is done by
    `self._get_feature_importance`, which receives both arguments by keyword.

    Arguments:
        model (str): Which model(s) to report on:
            - "best": the best model (default),
            - "all": every model,
            - any other string: the model with exactly that name.
        kind (str): "raw" returns the importance values as stored,
            "normalized" returns them rescaled.

    Returns:
        pandas.DataFrame or dict:
            - For "best" or a model name: DataFrame with columns
              ["feature", "importance"].
            - For "all": dict mapping model name to such a DataFrame.
    """
    importance = self._get_feature_importance(model=model, kind=kind)
    return importance

def need_retrain(
self,
X: Union[numpy.ndarray, pandas.DataFrame],
Expand Down
95 changes: 95 additions & 0 deletions supervised/base_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@
from supervised.utils.jsonencoder import MLJSONEncoder
from supervised.utils.leaderboard_plots import LeaderboardPlots
from supervised.utils.metric import Metric, UserDefinedEvalMetric
from supervised.utils.importance import PermutationImportance
from supervised.utils.report_structured import (
_load_model_importance_vector,
build_compact_view,
build_structured_report,
save_structured_report,
Expand Down Expand Up @@ -288,6 +290,99 @@ def get_leaderboard(

return ldb

def _normalize_importance(self, importance):
    """
    Min-max scale an importance vector.

    Arguments:
        importance (pandas.Series): Importance values indexed by feature.

    Returns:
        pandas.Series: `(importance - min) / (max - min)`, keeping the
        original index. When the minimum equals the maximum, a new
        all-zero Series over the same index is returned instead, which
        avoids dividing by zero.
    """
    low, high = float(importance.min()), float(importance.max())
    if high != low:
        return (importance - low) / (high - low)
    # Constant input: every feature is equally (un)important.
    return pd.Series(np.zeros(len(importance)), index=importance.index)

def _aggregate_feature_importance(self, model_name):
    """
    Load the stored importance vector for one model, sorted descending.

    Arguments:
        model_name (str): Name of the model; it is joined onto
            `self._results_path` to locate the model directory.

    Returns:
        The object returned by `_load_model_importance_vector` after
        `sort_values(ascending=False)` (presumably a pandas.Series indexed
        by feature name - confirm against that helper), or None when no
        results path is set or the helper returns None.
    """
    results_root = self._results_path
    if results_root is None:
        return None

    loaded = _load_model_importance_vector(os.path.join(results_root, model_name))
    return None if loaded is None else loaded.sort_values(ascending=False)

def _compute_feature_importance_for_model(self, model):
    """
    Best-effort computation of permutation importance for one model.

    The training data is loaded from `self._X_path` / `self._y_path`
    (falling back to "X.data" / "y.data" under `self._results_path`), and
    `PermutationImportance.compute_and_plot` is run for every
    learner/preprocessing pair of the model, writing into the model's
    directory under `self._results_path`.

    Arguments:
        model: Model to process. Anything that is not a `ModelFramework`
            is rejected.

    Returns:
        bool: True when every learner was processed, False when the model
        type is unsupported, no results path is set, or any step failed.
        Failures never propagate; they are logged with their traceback.
    """
    # Local import keeps this method self-contained without relying on a
    # module-level logger being defined elsewhere in the file.
    import logging

    if not isinstance(model, ModelFramework):
        return False

    results_path = self._results_path
    if results_path is None:
        # Nowhere to read the data from or write the importance files to.
        return False

    model_name = "<unknown>"
    try:
        model_name = model.get_name()
        X_path = self._X_path or os.path.join(results_path, "X.data")
        y_path = self._y_path or os.path.join(results_path, "y.data")

        X = load_data(X_path)
        y_loaded = load_data(y_path)
        y = y_loaded["target"] if isinstance(y_loaded, pd.DataFrame) else y_loaded

        model_path = os.path.join(results_path, model_name)
        for learner, preprocessing in zip(model.learners, model.preprocessings):
            learner.reload()
            # Each pair gets its own copies of X and y (presumably because
            # transform may modify its inputs - confirm).
            X_transformed, y_transformed, _ = preprocessing.transform(
                X.copy(), y.copy(), None
            )
            PermutationImportance.compute_and_plot(
                learner,
                X_transformed,
                y_transformed,
                model_path,
                learner.name,
                metric_name=model.get_metric_name(),
                ml_task=self._ml_task,
                n_jobs=self._n_jobs,
            )
    except Exception:
        # Deliberately best-effort: the caller falls back to an empty
        # result, but the reason must not disappear silently.
        logging.getLogger(__name__).warning(
            "Could not compute feature importance for model %s",
            model_name,
            exc_info=True,
        )
        return False

    return True

def _model_feature_importance(self, model, kind):
    """
    Build the feature-importance table for a single model.

    Stored importance is read first; when none is available it is computed
    once via `_compute_feature_importance_for_model` and read again.

    Arguments:
        model: Model object exposing `get_name()`.
        kind (str): "normalized" applies `_normalize_importance`; any
            other value leaves the values untouched.

    Returns:
        pandas.DataFrame: Columns ["feature", "importance"], sorted by
        importance in descending order with a fresh 0..n-1 index. Empty
        (columns only) when importance is still unavailable after the
        computation attempt.
    """
    name = model.get_name()
    scores = self._aggregate_feature_importance(name)
    if scores is None:
        # Nothing stored yet: try to compute it, then look again.
        self._compute_feature_importance_for_model(model)
        scores = self._aggregate_feature_importance(name)
        if scores is None:
            return pd.DataFrame(columns=["feature", "importance"])

    if kind == "normalized":
        scores = self._normalize_importance(scores)

    table = pd.DataFrame({"feature": scores.index, "importance": scores.values})
    table = table.sort_values("importance", ascending=False)
    return table.reset_index(drop=True)

def _get_feature_importance(self, model="best", kind="raw"):
    """
    Resolve the `model` selector and return the matching importance data.

    Arguments:
        model (str): "best", "all", or the exact name of a model in
            `self._models`.
        kind (str): "raw" or "normalized".

    Returns:
        pandas.DataFrame or dict: The result of `_model_feature_importance`
        for the selected model, or a dict {model_name: result} for "all".

    Raises:
        AutoMLException: If `kind` is invalid, if no best model is
            available, or if no model is named `model`.
    """
    if kind not in ("raw", "normalized"):
        raise AutoMLException(
            "Invalid kind parameter. Allowed values are: 'raw', 'normalized'."
        )

    if self._best_model is None:
        # `_check_can_load` presumably restores a saved AutoML from
        # `results_path` - confirm; it is only attempted when a path is set.
        if self.results_path is not None:
            self._check_can_load()
        if self._best_model is None:
            raise AutoMLException(
                "This model has not been fitted yet. Please call `fit()` first."
            )

    if model == "best":
        return self._model_feature_importance(self._best_model, kind)

    if model == "all":
        per_model = {}
        for candidate in self._models:
            # Resolve the name before computing the table, same order as a
            # dict comprehension would use.
            candidate_name = candidate.get_name()
            per_model[candidate_name] = self._model_feature_importance(
                candidate, kind
            )
        return per_model

    matches = [candidate for candidate in self._models if candidate.get_name() == model]
    if matches:
        return self._model_feature_importance(matches[0], kind)
    raise AutoMLException(
        f"Model `{model}` not found. Use 'best', 'all' or a valid model name."
    )

def keep_model(self, model, model_subpath):
if model is None:
return
Expand Down
76 changes: 76 additions & 0 deletions tests/tests_automl/test_feature_importance_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import os
import shutil
import unittest

import pandas as pd
import pytest
from sklearn import datasets

from supervised import AutoML


# Iris dataset, loaded once at import time; `_train` fits on its `data` and `target`.
iris = datasets.load_iris()


@pytest.mark.usefixtures("data_folder")
class AutoMLFeatureImportanceApiTest(unittest.TestCase):
    """End-to-end tests for `AutoML.get_feature_importance`.

    Every test trains a small AutoML run into `automl_dir`; the directory
    is removed both before and after each test so runs never see each
    other's artifacts.
    """

    automl_dir = "AutoMLFeatureImportanceApiTest"

    def tearDown(self):
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def setUp(self):
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def _train(self):
        """Fit and return an AutoML instance on the iris data."""
        automl = AutoML(
            results_path=self.automl_dir,
            mode="Explain",
            explain_level=1,
            algorithms=["Decision Tree"],
            train_ensemble=False,
            random_state=1,
            verbose=0,
        )
        automl.fit(iris.data, iris.target)
        return automl

    def _assert_importance_frame(self, fi):
        """Assert `fi` is a non-empty DataFrame with the expected columns."""
        self.assertIsInstance(fi, pd.DataFrame)
        self.assertIn("feature", fi.columns)
        self.assertIn("importance", fi.columns)
        self.assertGreater(fi.shape[0], 0)

    def test_get_feature_importance_best(self):
        automl = self._train()
        self._assert_importance_frame(automl.get_feature_importance())

    def test_get_feature_importance_all_and_model_name(self):
        automl = self._train()
        best_name = automl._best_model.get_name()

        fi_all = automl.get_feature_importance(model="all")
        self.assertIsInstance(fi_all, dict)
        self.assertIn(best_name, fi_all)

        fi_model = automl.get_feature_importance(model=best_name)
        self.assertIsInstance(fi_model, pd.DataFrame)
        self.assertGreater(fi_model.shape[0], 0)

    def test_get_feature_importance_normalized(self):
        automl = self._train()
        fi = automl.get_feature_importance(kind="normalized")
        self.assertGreater(fi.shape[0], 0)
        self.assertGreaterEqual(fi["importance"].min(), 0.0)
        self.assertLessEqual(fi["importance"].max(), 1.0)

    def test_recompute_feature_importance_if_missing(self):
        automl = self._train()
        model_name = automl._best_model.get_name()
        model_path = os.path.join(self.automl_dir, model_name)

        # Delete the stored (non-SHAP) importance files so the API has to
        # recompute them.
        # NOTE(review): if no file matches, this test passes without
        # exercising the recompute path - consider asserting that at least
        # one file was removed once the file naming is confirmed.
        for f in os.listdir(model_path):
            if "_importance.csv" in f and "shap" not in f:
                os.remove(os.path.join(model_path, f))

        self._assert_importance_frame(
            automl.get_feature_importance(model=model_name)
        )