Skip to content

Commit 1f93dbf

Browse files
committed
Add sklearn estimators feature_importances_ attribute
This attribute quantifies the importance of each of the _input_ features, in their order of occurrence in the input dataset: if the feature is used, its importance is retrieved from the report (as the average of its exact Shapley values across the training dataset); otherwise, its importance is set to 0.0.
1 parent 2386d76 commit 1f93dbf

5 files changed

Lines changed: 147 additions & 22 deletions

File tree

doc/samples/samples_sklearn.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,15 @@ Samples
7676
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
7777
print("---")
7878
79+
print("Top 5 used features, among those present in the dataset")
80+
for feature, importance in sorted(
81+
zip(khc.feature_names_in_, khc.feature_importances_),
82+
key=lambda imp: float(imp[1]),
83+
reverse=True,
84+
)[:5]:
85+
print(f"{feature} - Importance: {importance}")
86+
print("---")
87+
7988
# Predict the classes on the test dataset
8089
y_test_pred = khc.predict(X_test)
8190
print("Predicted classes (first 10):")

khiops/samples/samples_sklearn.ipynb

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,15 @@
6262
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
6363
"print(\"---\")\n",
6464
"\n",
65+
"print(\"Top 5 used features, among those present in the dataset\")\n",
66+
"for feature, importance in sorted(\n",
67+
" zip(khc.feature_names_in_, khc.feature_importances_),\n",
68+
" key=lambda imp: float(imp[1]),\n",
69+
" reverse=True,\n",
70+
")[:5]:\n",
71+
" print(f\"{feature} - Importance: {importance}\")\n",
72+
"print(\"---\")\n",
73+
"\n",
6574
"# Predict the classes on the test dataset\n",
6675
"y_test_pred = khc.predict(X_test)\n",
6776
"print(\"Predicted classes (first 10):\")\n",

khiops/samples/samples_sklearn.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,15 @@ def khiops_classifier():
6565
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
6666
print("---")
6767

68+
print("Top 5 used features, among those present in the dataset")
69+
for feature, importance in sorted(
70+
zip(khc.feature_names_in_, khc.feature_importances_),
71+
key=lambda imp: float(imp[1]),
72+
reverse=True,
73+
)[:5]:
74+
print(f"{feature} - Importance: {importance}")
75+
print("---")
76+
6877
# Predict the classes on the test dataset
6978
y_test_pred = khc.predict(X_test)
7079
print("Predicted classes (first 10):")

khiops/sklearn/estimators.py

Lines changed: 99 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,7 @@ def _fit(self, ds, computation_dir, **kwargs):
427427
and hasattr(self, "model_report_")
428428
and isinstance(self.model_report_, kh.KhiopsJSONObject)
429429
):
430+
self.feature_names_in_ = ds.main_table.column_ids
430431
self._fit_training_post_process(ds)
431432
self.is_multitable_model_ = ds.is_multitable
432433
self.n_features_in_ = ds.main_table.n_features()
@@ -1597,6 +1598,36 @@ def __init__(
15971598
self.n_evaluated_features = n_evaluated_features
15981599
self.n_selected_features = n_selected_features
15991600

1601+
def _fit_training_post_process(self, ds):
1602+
# Call the parent's method
1603+
super()._fit_training_post_process(ds)
1604+
1605+
# Extract statistics, about the selected features, from the modeling report
1606+
modeling_report = self.model_report_.modeling_report.get_snb_predictor()
1607+
if modeling_report.selected_variables is not None:
1608+
feature_used_names_, feature_used_importances_ = (
1609+
self.get_feature_used_statistics(modeling_report)
1610+
)
1611+
self.feature_used_names_ = feature_used_names_
1612+
self.feature_used_importances_ = feature_used_importances_
1613+
self.n_features_used_ = len(self.feature_used_names_)
1614+
1615+
# feature_used_names_ is not set if no variable is selected in the model
1616+
feature_used_names = getattr(self, "feature_used_names_", [])
1617+
1618+
# Compute feature importances
1619+
feature_importances = []
1620+
for feature_name in self.feature_names_in_:
1621+
if feature_name in feature_used_names:
1622+
feature_index = np.where(feature_used_names == feature_name)
1623+
feature_importance = self.feature_used_importances_[
1624+
feature_index
1625+
].ravel()[2]
1626+
else:
1627+
feature_importance = 0.0
1628+
feature_importances.append(feature_importance)
1629+
self.feature_importances_ = np.array(feature_importances)
1630+
16001631
def __sklearn_tags__(self):
16011632
# If we don't implement this trivial method, it is not found by sklearn. This
16021633
# is likely due to the complex resolution of the multiple inheritance.
@@ -1795,6 +1826,39 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
17951826
classes_ : `ndarray <numpy.ndarray>` of shape (n_classes\_,)
17961827
The list of classes seen in training. Depending on the training target, the
17971828
contents are ``int`` or ``str``.
1829+
n_features_in_ : int
1830+
The number of features in the main table of the training dataset.
1831+
feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
1832+
Names of the features in the main table of the training dataset.
1833+
feature_importances_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_, )
1834+
Importances of the features provided to the classifier, in the main
1835+
table of the training dataset. The importance of each feature is
1836+
calculated as follows:
1837+
1838+
- if the feature is used by the classifier, then its importance is the
1839+
average of its exact Shapley values across the training dataset.
1840+
1841+
- if the feature is not used by the classifier, then its importance
1842+
is 0.0.
1843+
1844+
.. warning::
1845+
Since Khiops is an AutoML suite, it uses generated features on its
1846+
predictors (e.g. regularized decision trees). This implies that there
1847+
is no direct link between a native feature and its importance
1848+
when AutoML features are used, as an important feature might not be
1849+
selected, but a generated feature might (e.g. a tree containing an
1850+
important variable).
1851+
1852+
To ensure that the ``feature_importances_`` attribute has
1853+
`the meaning specified by scikit-learn <https://scikit-learn.org/stable/glossary.html#term-feature_importances_>`_
1854+
one must disable most AutoML capabilities of Khiops, namely:
1855+
1856+
- the training dataset must be monotable;
1857+
- no timestamp column should be used in the training dataset;
1858+
- the ``n_trees`` parameter must be set to 0;
1859+
- the ``n_pairs`` parameter must be left to its default value, 0;
1860+
- the ``n_text_features`` parameter must be set to 0.
1861+
17981862
n_features_evaluated_ : int
17991863
The number of features evaluated by the classifier.
18001864
feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -1817,7 +1881,8 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
18171881
to all features selected by the classifier. It ranges between 0 (little
18181882
contribution to the model) and 1 (large contribution to the model).
18191883
1820-
- Importance: The geometric mean between the Level and the Weight.
1884+
- Importance: Average of the exact Shapley values of each used feature
1885+
across the training data.
18211886
18221887
is_multitable_model_ : bool
18231888
``True`` if the model was fitted on a multi-table dataset.
@@ -2029,16 +2094,6 @@ def _fit_training_post_process(self, ds):
20292094
if key.startswith("TargetProb"):
20302095
variable.used = True
20312096

2032-
# Extract statistics, about the selected features, from the modeling report
2033-
modeling_report = self.model_report_.modeling_report.get_snb_predictor()
2034-
if modeling_report.selected_variables is not None:
2035-
feature_used_names_, feature_used_importances_ = (
2036-
self.get_feature_used_statistics(modeling_report)
2037-
)
2038-
self.feature_used_names_ = feature_used_names_
2039-
self.feature_used_importances_ = feature_used_importances_
2040-
self.n_features_used_ = len(self.feature_used_names_)
2041-
20422097
def predict(self, X):
20432098
"""Predicts the most probable class for the test dataset X
20442099
@@ -2208,6 +2263,37 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
22082263
22092264
Attributes
22102265
----------
2266+
n_features_in_ : int
2267+
The number of features in the main table of the training dataset.
2268+
feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
2269+
Names of the features in the main table of the training dataset.
2270+
feature_importances_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_, )
2271+
Importances of the features provided to the regressor. The importance of each feature is calculated as follows:
2272+
2273+
- if the feature is used by the regressor, then its importance is the
2274+
average of its exact Shapley values across the training dataset.
2275+
2276+
- if the feature is not used by the regressor, then its importance
2277+
is 0.0.
2278+
2279+
.. warning::
2280+
Since Khiops is an AutoML suite, it uses generated features on its
2281+
predictors (e.g. regularized decision trees). This implies that there
2282+
is no direct link between a native feature and its importance
2283+
when AutoML features are used, as an important feature might not be
2284+
selected, but a generated feature might (e.g. a tree containing an
2285+
important variable).
2286+
2287+
To ensure that the ``feature_importances_`` attribute has
2288+
`the meaning specified by scikit-learn <https://scikit-learn.org/stable/glossary.html#term-feature_importances_>`_
2289+
one must disable most AutoML capabilities of Khiops, namely:
2290+
2291+
- the training dataset must be monotable;
2292+
- no timestamp column should be used in the training dataset;
2293+
- the ``n_trees`` parameter must be set to 0;
2294+
- the ``n_pairs`` parameter must be left to its default value, 0;
2295+
- the ``n_text_features`` parameter must be set to 0.
2296+
22112297
n_features_evaluated_ : int
22122298
The number of features evaluated by the regressor.
22132299
feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -2230,7 +2316,8 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
22302316
to all features selected by the regressor. It ranges between 0 (little
22312317
contribution to the model) and 1 (large contribution to the model).
22322318
2233-
- Importance: The geometric mean between the Level and the Weight.
2319+
- Importance: Average of the exact Shapley values of each used feature
2320+
across the training data.
22342321
22352322
is_multitable_model_ : bool
22362323
``True`` if the model was fitted on a multi-table dataset.
@@ -2335,16 +2422,6 @@ def _fit_training_post_process(self, ds):
23352422
for variable_name in variables_to_eliminate:
23362423
self._get_main_dictionary().remove_variable(variable_name)
23372424

2338-
# Extract statistics, about the selected features, from the modeling report
2339-
modeling_report = self.model_report_.modeling_report.get_snb_predictor()
2340-
if modeling_report.selected_variables is not None:
2341-
feature_used_names_, feature_used_importances_ = (
2342-
self.get_feature_used_statistics(modeling_report)
2343-
)
2344-
self.feature_used_names_ = feature_used_names_
2345-
self.feature_used_importances_ = feature_used_importances_
2346-
self.n_features_used_ = len(self.feature_used_names_)
2347-
23482425
def _check_target_type(self, ds):
23492426
_check_numerical_target_type(ds)
23502427

tests/test_estimator_attributes.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import pandas as pd
1414

1515
from khiops import core as kh
16+
from khiops.sklearn.dataset import Dataset
1617
from khiops.sklearn.estimators import KhiopsClassifier, KhiopsEncoder, KhiopsRegressor
1718

1819
# Disable PEP8 variable names because of scikit-learn X,y conventions
@@ -74,6 +75,7 @@ def assert_attribute_values_ok(self, model, X, y):
7475
self.assertEqual(model.classes_.tolist(), sorted(y.unique()))
7576
self.assertEqual(model.n_classes_, len(y.unique()))
7677
self.assertEqual(model.n_features_in_, len(X.columns))
78+
self.assertEqual(model.feature_names_in_.tolist(), X.columns.tolist())
7779

7880
# Extract the features and their levels from the report
7981
# TODO: Eliminate this, as it appears to duplicate the implementation under test
@@ -162,6 +164,25 @@ def assert_attribute_values_ok(self, model, X, y):
162164
model.n_features_used_, len(feature_used_importances_report)
163165
)
164166

167+
# Test input feature names and importances
168+
ds = Dataset(X)
169+
feature_names_in_dataset = ds.main_table.column_ids
170+
self.assertEqual(
171+
model.feature_names_in_.tolist(), feature_names_in_dataset.tolist()
172+
)
173+
feature_importances_report = []
174+
for feature_name in feature_names_in_dataset:
175+
if feature_name in feature_used_names:
176+
feature_index = feature_used_names.index(feature_name)
177+
feature_importances_report.append(
178+
feature_used_importances_report[feature_index][2]
179+
)
180+
else:
181+
feature_importances_report.append(0.0)
182+
self.assertEqual(
183+
model.feature_importances_.tolist(), feature_importances_report
184+
)
185+
165186
def test_classifier_attributes_monotable(self):
166187
"""Test consistency of KhiopsClassifier's attributes with the output reports
167188

0 commit comments

Comments
 (0)