Add sklearn estimators feature_importances_ attribute

popescu-v · popescu-v · commit 1f93dbf5598e · 2025-10-07T11:45:46.000+02:00
This attribute quantifies the importance of each of the _input_
features, in their order of occurrence in the input dataset:
- if the feature is used, then its importance is retrieved from the
  report (as the average of its exact Shapley values across the training
  dataset)
- else, the importance is set to 0.0.
diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst
@@ -76,6 +76,15 @@ Samples
         print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
     print("---")
 
+    print("Top 5 used features, among those present in the dataset")
+    for feature, importance in sorted(
+        zip(khc.feature_names_in_, khc.feature_importances_),
+        key=lambda imp: float(imp[1]),
+        reverse=True,
+    )[:5]:
+        print(f"{feature} - Importance: {importance}")
+    print("---")
+
     # Predict the classes on the test dataset
     y_test_pred = khc.predict(X_test)
     print("Predicted classes (first 10):")
diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb
@@ -62,6 +62,15 @@
     "    print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
     "print(\"---\")\n",
     "\n",
+    "print(\"Top 5 used features, among those present in the dataset\")\n",
+    "for feature, importance in sorted(\n",
+    "    zip(khc.feature_names_in_, khc.feature_importances_),\n",
+    "    key=lambda imp: float(imp[1]),\n",
+    "    reverse=True,\n",
+    ")[:5]:\n",
+    "    print(f\"{feature} - Importance: {importance}\")\n",
+    "print(\"---\")\n",
+    "\n",
     "# Predict the classes on the test dataset\n",
     "y_test_pred = khc.predict(X_test)\n",
     "print(\"Predicted classes (first 10):\")\n",
diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py
@@ -65,6 +65,15 @@ def khiops_classifier():
         print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
     print("---")
 
+    print("Top 5 used features, among those present in the dataset")
+    for feature, importance in sorted(
+        zip(khc.feature_names_in_, khc.feature_importances_),
+        key=lambda imp: float(imp[1]),
+        reverse=True,
+    )[:5]:
+        print(f"{feature} - Importance: {importance}")
+    print("---")
+
     # Predict the classes on the test dataset
     y_test_pred = khc.predict(X_test)
     print("Predicted classes (first 10):")
diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py
@@ -427,6 +427,7 @@ def _fit(self, ds, computation_dir, **kwargs):
             and hasattr(self, "model_report_")
             and isinstance(self.model_report_, kh.KhiopsJSONObject)
         ):
+            self.feature_names_in_ = ds.main_table.column_ids
             self._fit_training_post_process(ds)
             self.is_multitable_model_ = ds.is_multitable
             self.n_features_in_ = ds.main_table.n_features()
@@ -1597,6 +1598,36 @@ def __init__(
         self.n_evaluated_features = n_evaluated_features
         self.n_selected_features = n_selected_features
 
+    def _fit_training_post_process(self, ds):
+        # Run the parent's post-processing first
+        super()._fit_training_post_process(ds)
+
+        # Extract the statistics of the selected features from the modeling report
+        modeling_report = self.model_report_.modeling_report.get_snb_predictor()
+        if modeling_report.selected_variables is not None:
+            feature_used_names_, feature_used_importances_ = (
+                self.get_feature_used_statistics(modeling_report)
+            )
+            self.feature_used_names_ = feature_used_names_
+            self.feature_used_importances_ = feature_used_importances_
+            self.n_features_used_ = len(self.feature_used_names_)
+
+        # Empty fallback: feature_used_names_ may be unset (selected_variables is None)
+        feature_used_names = getattr(self, "feature_used_names_", [])
+
+        # One importance per input feature, in input order; unused features get 0.0
+        feature_importances = []
+        for feature_name in self.feature_names_in_:
+            if feature_name in feature_used_names:
+                feature_index = np.where(feature_used_names == feature_name)
+                feature_importance = self.feature_used_importances_[
+                    feature_index
+                ].ravel()[2]  # index 2 is the Importance entry
+            else:
+                feature_importance = 0.0
+            feature_importances.append(feature_importance)
+        self.feature_importances_ = np.array(feature_importances)
+
     def __sklearn_tags__(self):
         # If we don't implement this trivial method it's not found by the sklearn. This
         # is likely due to the complex resolution of the multiple inheritance.
@@ -1795,6 +1826,39 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
     classes_ : `ndarray <numpy.ndarray>` of shape (n_classes\_,)
         The list of classes seen in training. Depending on the training target, the
         contents are ``int`` or ``str``.
+    n_features_in_ : int
+        The number of features in the main table of the training dataset.
+    feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
+        Names of the features in the main table of the training dataset.
+    feature_importances_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
+        Importances of the features provided to the classifier, in the main
+        table of the training dataset. The importance of each feature is
+        calculated as follows:
+
+        - if the feature is used by the classifier, then its importance is the
+          average of its exact Shapley values across the training dataset.
+
+        - if the feature is not used by the classifier, then its importance
+          is 0.0.
+
+        .. warning::
+            Since Khiops is an AutoML suite, it uses generated features in its
+            predictors (e.g. regularized decision trees). This implies that there
+            is no direct link between the native features and their importance
+            when AutoML features are used, as an important feature might not be
+            selected, but a generated feature might (e.g. a tree containing an
+            important variable).
+
+            To ensure that the ``feature_importances_`` attribute has
+            `the meaning specified by scikit-learn <https://scikit-learn.org/stable/glossary.html#term-feature_importances_>`_,
+            one must disable most AutoML capabilities of Khiops, namely:
+
+             - the training dataset must be monotable;
+             - no timestamp column should be used in the training dataset;
+             - the ``n_trees`` parameter must be set to 0;
+             - the ``n_pairs`` parameter must be left at its default value, 0;
+             - the ``n_text_features`` parameter must be set to 0.
+
     n_features_evaluated_ : int
         The number of features evaluated by the classifier.
     feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -1817,7 +1881,8 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
           to all features selected by the classifier. It ranges between 0 (little
           contribution to the model) and 1 (large contribution to the model).
 
-        - Importance: The geometric mean between the Level and the Weight.
+        - Importance: Average of the exact Shapley values of each used feature
+          across the training data.
 
     is_multitable_model_ : bool
         ``True`` if the model was fitted on a multi-table dataset.
@@ -2029,16 +2094,6 @@ def _fit_training_post_process(self, ds):
                 if key.startswith("TargetProb"):
                     variable.used = True
 
-        # Extract statistics, about the selected features, from the modeling report
-        modeling_report = self.model_report_.modeling_report.get_snb_predictor()
-        if modeling_report.selected_variables is not None:
-            feature_used_names_, feature_used_importances_ = (
-                self.get_feature_used_statistics(modeling_report)
-            )
-            self.feature_used_names_ = feature_used_names_
-            self.feature_used_importances_ = feature_used_importances_
-            self.n_features_used_ = len(self.feature_used_names_)
-
     def predict(self, X):
         """Predicts the most probable class for the test dataset X
 
@@ -2208,6 +2263,37 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
 
     Attributes
     ----------
+    n_features_in_ : int
+        The number of features in the main table of the training dataset.
+    feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
+        Names of the features in the main table of the training dataset.
+    feature_importances_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
+        Importances of the features provided to the regressor, in the main
+        table of the training dataset, calculated as follows:
+
+        - if the feature is used by the regressor, then its importance is the
+          average of its exact Shapley values across the training dataset.
+        - if the feature is not used by the regressor, then its importance
+          is 0.0.
+
+        .. warning::
+            Since Khiops is an AutoML suite, it uses generated features in its
+            predictors (e.g. regularized decision trees). This implies that there
+            is no direct link between the native features and their importance
+            when AutoML features are used, as an important feature might not be
+            selected, but a generated feature might (e.g. a tree containing an
+            important variable).
+
+            To ensure that the ``feature_importances_`` attribute has
+            `the meaning specified by scikit-learn <https://scikit-learn.org/stable/glossary.html#term-feature_importances_>`_,
+            one must disable most AutoML capabilities of Khiops, namely:
+
+             - the training dataset must be monotable;
+             - no timestamp column should be used in the training dataset;
+             - the ``n_trees`` parameter must be set to 0;
+             - the ``n_pairs`` parameter must be left at its default value, 0;
+             - the ``n_text_features`` parameter must be set to 0.
+
     n_features_evaluated_ : int
         The number of features evaluated by the classifier.
     feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -2230,7 +2316,8 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
           to all features selected by the classifier. It ranges between 0 (little
           contribution to the model) and 1 (large contribution to the model).
 
-        - Importance: The geometric mean between the Level and the Weight.
+        - Importance: Average of the exact Shapley values of each used feature
+          across the training data.
 
     is_multitable_model_ : bool
         ``True`` if the model was fitted on a multi-table dataset.
@@ -2335,16 +2422,6 @@ def _fit_training_post_process(self, ds):
         for variable_name in variables_to_eliminate:
             self._get_main_dictionary().remove_variable(variable_name)
 
-        # Extract statistics, about the selected features, from the modeling report
-        modeling_report = self.model_report_.modeling_report.get_snb_predictor()
-        if modeling_report.selected_variables is not None:
-            feature_used_names_, feature_used_importances_ = (
-                self.get_feature_used_statistics(modeling_report)
-            )
-            self.feature_used_names_ = feature_used_names_
-            self.feature_used_importances_ = feature_used_importances_
-            self.n_features_used_ = len(self.feature_used_names_)
-
     def _check_target_type(self, ds):
         _check_numerical_target_type(ds)
 
diff --git a/tests/test_estimator_attributes.py b/tests/test_estimator_attributes.py
@@ -13,6 +13,7 @@
 import pandas as pd
 
 from khiops import core as kh
+from khiops.sklearn.dataset import Dataset
 from khiops.sklearn.estimators import KhiopsClassifier, KhiopsEncoder, KhiopsRegressor
 
 # Disable PEP8 variable names because of scikit-learn X,y conventions
@@ -74,6 +75,7 @@ def assert_attribute_values_ok(self, model, X, y):
             self.assertEqual(model.classes_.tolist(), sorted(y.unique()))
             self.assertEqual(model.n_classes_, len(y.unique()))
             self.assertEqual(model.n_features_in_, len(X.columns))
+            self.assertEqual(model.feature_names_in_.tolist(), X.columns.tolist())
 
         # Extract the features and their levels from the report
         # TODO: Eliminate this as this is the implementation
@@ -162,6 +164,25 @@ def assert_attribute_values_ok(self, model, X, y):
                 model.n_features_used_, len(feature_used_importances_report)
             )
 
+            # Test input feature names and importances
+            ds = Dataset(X)
+            feature_names_in_dataset = ds.main_table.column_ids
+            self.assertEqual(
+                model.feature_names_in_.tolist(), feature_names_in_dataset.tolist()
+            )
+            feature_importances_report = []
+            for feature_name in feature_names_in_dataset:
+                if feature_name in feature_used_names:
+                    feature_index = feature_used_names.index(feature_name)
+                    feature_importances_report.append(
+                        feature_used_importances_report[feature_index][2]
+                    )
+                else:
+                    feature_importances_report.append(0.0)
+            self.assertEqual(
+                model.feature_importances_.tolist(), feature_importances_report
+            )
+
     def test_classifier_attributes_monotable(self):
         """Test consistency of KhiopsClassifier's attributes with the output reports