Skip to content

Commit e5dbf4c

Browse files
committed
Add sklearn estimators feature_importances_ attribute
TODO: - remove feature_evaluated_names_, n_features_evaluated_ and feature_evaluated_importances_ - drop level and weight from feature_used_importances_ - accordingly, update tests and sklearn samples to reflect these changes. The rationale of these changes is that the evaluated features, as well as the level and weight of the used features characterize the analysis process, not the model itself. Hence, they should not be stored on the estimator, which represents the model rather than the analysis process.
1 parent cee3313 commit e5dbf4c

5 files changed

Lines changed: 131 additions & 22 deletions

File tree

doc/samples/samples_sklearn.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,15 @@ Samples
7676
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
7777
print("---")
7878
79+
print("Top 5 used features, among those present in the dataset")
80+
for feature, importance in sorted(
81+
zip(khc.feature_names_in_, khc.feature_importances_),
82+
key=lambda feature_importance: float(feature_importance[1]),
83+
reverse=True,
84+
)[:5]:
85+
print(f"{feature} - Importance: {importance}")
86+
print("---")
87+
7988
# Predict the classes on the test dataset
8089
y_test_pred = khc.predict(X_test)
8190
print("Predicted classes (first 10):")

khiops/samples/samples_sklearn.ipynb

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,15 @@
6262
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
6363
"print(\"---\")\n",
6464
"\n",
65+
"print(\"Top 5 used features, among those present in the dataset\")\n",
66+
"for feature, importance in sorted(\n",
67+
" zip(khc.feature_names_in_, khc.feature_importances_),\n",
68+
" key=lambda feature_importance: float(feature_importance[1]),\n",
69+
" reverse=True,\n",
70+
")[:5]:\n",
71+
" print(f\"{feature} - Importance: {importance}\")\n",
72+
"print(\"---\")\n",
73+
"\n",
6574
"# Predict the classes on the test dataset\n",
6675
"y_test_pred = khc.predict(X_test)\n",
6776
"print(\"Predicted classes (first 10):\")\n",

khiops/samples/samples_sklearn.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,15 @@ def khiops_classifier():
6565
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
6666
print("---")
6767

68+
print("Top 5 used features, among those present in the dataset")
69+
for feature, importance in sorted(
70+
zip(khc.feature_names_in_, khc.feature_importances_),
71+
key=lambda feature_importance: float(feature_importance[1]),
72+
reverse=True,
73+
)[:5]:
74+
print(f"{feature} - Importance: {importance}")
75+
print("---")
76+
6877
# Predict the classes on the test dataset
6978
y_test_pred = khc.predict(X_test)
7079
print("Predicted classes (first 10):")

khiops/sklearn/estimators.py

Lines changed: 83 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,7 @@ def _fit(self, ds, computation_dir, **kwargs):
427427
and hasattr(self, "model_report_")
428428
and isinstance(self.model_report_, kh.KhiopsJSONObject)
429429
):
430+
self.feature_names_in_ = ds.main_table.column_ids
430431
self._fit_training_post_process(ds)
431432
self.is_multitable_model_ = ds.is_multitable
432433
self.n_features_in_ = ds.main_table.n_features()
@@ -1597,6 +1598,36 @@ def __init__(
15971598
self.n_evaluated_features = n_evaluated_features
15981599
self.n_selected_features = n_selected_features
15991600

1601+
def _fit_training_post_process(self, ds):
    """Sets the used-feature statistics and the per-feature importances

    After delegating to the parent class, the statistics of the features
    selected by the SNB predictor are read from the modeling report and stored
    in ``feature_used_names_``, ``feature_used_importances_`` and
    ``n_features_used_``. These three attributes are set only when the report
    contains selected variables.

    ``feature_importances_`` is always set. It contains one value for each name
    in ``feature_names_in_``, in the same order: the value found at column index
    2 of the feature's row in the used-feature importances if the feature is
    used by the model, 0.0 otherwise.

    Parameters
    ----------
    ds
        The training dataset. It is only forwarded to the parent's method.
    """
    # Call the parent's method
    super()._fit_training_post_process(ds)

    # Extract statistics, about the selected features, from the modeling report
    # The statistics are also kept in local variables: the importances below
    # must reflect this fit only, never attributes left on the estimator by an
    # earlier one
    modeling_report = self.model_report_.modeling_report.get_snb_predictor()
    used_names = []
    used_importances = None
    if modeling_report.selected_variables is not None:
        used_names, used_importances = self.get_feature_used_statistics(
            modeling_report
        )
        self.feature_used_names_ = used_names
        self.feature_used_importances_ = used_importances
        self.n_features_used_ = len(used_names)

    # Index the used features by name, once, instead of scanning the names for
    # each input feature; if a name is repeated its first row is kept
    used_feature_rows = {}
    for row, used_name in enumerate(used_names):
        used_feature_rows.setdefault(used_name, row)

    # Compute feature importances
    # Input features that are not used by the model have a 0.0 importance
    feature_importances = []
    for feature_name in self.feature_names_in_:
        row = used_feature_rows.get(feature_name)
        if row is None:
            feature_importances.append(0.0)
        else:
            feature_importances.append(used_importances[row][2])
    self.feature_importances_ = np.array(feature_importances)
1630+
16001631
def __sklearn_tags__(self):
16011632
# If we don't implement this trivial method it's not found by the sklearn. This
16021633
# is likely due to the complex resolution of the multiple inheritance.
@@ -1795,6 +1826,30 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
17951826
classes_ : `ndarray <numpy.ndarray>` of shape (n_classes\_,)
17961827
The list of classes seen in training. Depending on the training target, the
17971828
contents are ``int`` or ``str``.
1829+
n_features_in_ : int
1830+
The number of features in the main table of the training dataset.
1831+
feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
1832+
Names of the features in the main table of the training dataset.
1833+
feature_importances_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_, )
1834+
Importances of the features provided to the classifier. The importance of each feature is calculated as follows:
1835+
1836+
- if the feature is used by the classifier, then its importance is equal
1837+
to the average of its exact Shapley values across the training data.
1838+
1839+
- if the feature is not used by the classifier, then its importance
1840+
equals 0.0.
1841+
1842+
.. note::
1843+
In order to maximize the accuracy of the feature importances, the
1844+
estimator must be trained on monotable data, must not use trees,
1845+
feature pairs, text features or timestamps. More precisely:
1846+
1847+
- the training dataset must be monotable;
1848+
- no timestamp column should be used in the training dataset;
1849+
- the `n_trees` parameter must be set to 0;
1850+
- the `n_pairs` parameter must be left at its default value, 0;
1851+
- the `n_text_features` parameter must be set to 0.
1852+
17981853
n_features_evaluated_ : int
17991854
The number of features evaluated by the classifier.
18001855
feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -1817,7 +1872,8 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
18171872
to all features selected by the classifier. It ranges between 0 (little
18181873
contribution to the model) and 1 (large contribution to the model).
18191874
1820-
- Importance: The geometric mean between the Level and the Weight.
1875+
- Importance: Average of the exact Shapley values of each used feature
1876+
across the training data.
18211877
18221878
is_multitable_model_ : bool
18231879
``True`` if the model was fitted on a multi-table dataset.
@@ -2029,16 +2085,6 @@ def _fit_training_post_process(self, ds):
20292085
if key.startswith("TargetProb"):
20302086
variable.used = True
20312087

2032-
# Extract statistics, about the selected features, from the modeling report
2033-
modeling_report = self.model_report_.modeling_report.get_snb_predictor()
2034-
if modeling_report.selected_variables is not None:
2035-
feature_used_names_, feature_used_importances_ = (
2036-
self.get_feature_used_statistics(modeling_report)
2037-
)
2038-
self.feature_used_names_ = feature_used_names_
2039-
self.feature_used_importances_ = feature_used_importances_
2040-
self.n_features_used_ = len(self.feature_used_names_)
2041-
20422088
def predict(self, X):
20432089
"""Predicts the most probable class for the test dataset X
20442090
@@ -2208,6 +2254,30 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
22082254
22092255
Attributes
22102256
----------
2257+
n_features_in_ : int
2258+
The number of features in the main table of the training dataset.
2259+
feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
2260+
Names of the features in the main table of the training dataset.
2261+
feature_importances_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_, )
2262+
Importances of the features provided to the regressor. The importance of each feature is calculated as follows:
2263+
2264+
- if the feature is used by the regressor, then its importance is equal
2265+
to the average of its exact Shapley values across the training data.
2266+
2267+
- if the feature is not used by the regressor, then its importance
2268+
equals 0.0.
2269+
2270+
.. note::
2271+
In order to maximize the accuracy of the feature importances, the
2272+
estimator must be trained on monotable data, must not use trees,
2273+
feature pairs, text features or timestamps. More precisely:
2274+
2275+
- the training dataset must be monotable;
2276+
- no timestamp column should be used in the training dataset;
2277+
- the `n_trees` parameter must be set to 0;
2278+
- the `n_pairs` parameter must be left at its default value, 0;
2279+
- the `n_text_features` parameter must be set to 0.
2280+
22112281
n_features_evaluated_ : int
22122282
The number of features evaluated by the classifier.
22132283
feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -2230,7 +2300,8 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
22302300
to all features selected by the classifier. It ranges between 0 (little
22312301
contribution to the model) and 1 (large contribution to the model).
22322302
2233-
- Importance: The geometric mean between the Level and the Weight.
2303+
- Importance: Average of the exact Shapley values of each used feature
2304+
across the training data.
22342305
22352306
is_multitable_model_ : bool
22362307
``True`` if the model was fitted on a multi-table dataset.
@@ -2335,16 +2406,6 @@ def _fit_training_post_process(self, ds):
23352406
for variable_name in variables_to_eliminate:
23362407
self._get_main_dictionary().remove_variable(variable_name)
23372408

2338-
# Extract statistics, about the selected features, from the modeling report
2339-
modeling_report = self.model_report_.modeling_report.get_snb_predictor()
2340-
if modeling_report.selected_variables is not None:
2341-
feature_used_names_, feature_used_importances_ = (
2342-
self.get_feature_used_statistics(modeling_report)
2343-
)
2344-
self.feature_used_names_ = feature_used_names_
2345-
self.feature_used_importances_ = feature_used_importances_
2346-
self.n_features_used_ = len(self.feature_used_names_)
2347-
23482409
def _check_target_type(self, ds):
23492410
_check_numerical_target_type(ds)
23502411

tests/test_estimator_attributes.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import pandas as pd
1414

1515
from khiops import core as kh
16+
from khiops.sklearn.dataset import Dataset
1617
from khiops.sklearn.estimators import KhiopsClassifier, KhiopsEncoder, KhiopsRegressor
1718

1819
# Disable PEP8 variable names because of scikit-learn X,y conventions
@@ -74,6 +75,7 @@ def assert_attribute_values_ok(self, model, X, y):
7475
self.assertEqual(model.classes_.tolist(), sorted(y.unique()))
7576
self.assertEqual(model.n_classes_, len(y.unique()))
7677
self.assertEqual(model.n_features_in_, len(X.columns))
78+
self.assertEqual(model.feature_names_in_.tolist(), X.columns.tolist())
7779

7880
# Extract the features and their levels from the report
7981
# TODO: Eliminate this as this is the implementation
@@ -162,6 +164,25 @@ def assert_attribute_values_ok(self, model, X, y):
162164
model.n_features_used_, len(feature_used_importances_report)
163165
)
164166

167+
ds = Dataset(X)
168+
feature_names_in_dataset = ds.main_table.column_ids
169+
self.assertEqual(
170+
model.feature_names_in_.tolist(), feature_names_in_dataset.tolist()
171+
)
172+
173+
feature_importances_report = []
174+
for feature_name in feature_names_in_dataset:
175+
if feature_name in feature_used_names:
176+
feature_index = feature_used_names.index(feature_name)
177+
feature_importances_report.append(
178+
feature_used_importances_report[feature_index][2]
179+
)
180+
else:
181+
feature_importances_report.append(0.0)
182+
self.assertEqual(
183+
model.feature_importances_.tolist(), feature_importances_report
184+
)
185+
165186
def test_classifier_attributes_monotable(self):
166187
"""Test consistency of KhiopsClassifier's attributes with the output reports
167188

0 commit comments

Comments
 (0)