Add sklearn estimators feature_importances_ attribute

popescu-v · popescu-v · commit e5dbf4c4752e · 2025-10-02T18:24:57.000+02:00
TODO:
- remove feature_evaluated_names_, n_features_evaluated_ and
  feature_evaluated_importances_
- drop level and weight from feature_used_importances_
- accordingly, update tests and sklearn samples to reflect these changes.

The rationale of these changes is that the evaluated features, as well as the
level and weight of the used features, characterize the analysis process,
not the model itself. Hence, they should not be stored on the estimator,
which represents the model rather than the analysis process.
diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst
@@ -76,6 +76,15 @@ Samples
         print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
     print("---")
 
+    print("Top 5 used features, among those present in the dataset")
+    for feature, importance in sorted(
+        zip(khc.feature_names_in_, khc.feature_importances_),
+        key=lambda feature_importance: float(feature_importance[1]),
+        reverse=True,
+    )[:5]:
+        print(f"{feature} - Importance: {importance}")
+    print("---")
+
     # Predict the classes on the test dataset
     y_test_pred = khc.predict(X_test)
     print("Predicted classes (first 10):")
diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb
@@ -62,6 +62,15 @@
     "    print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
     "print(\"---\")\n",
     "\n",
+    "print(\"Top 5 used features, among those present in the dataset\")\n",
+    "for feature, importance in sorted(\n",
+    "    zip(khc.feature_names_in_, khc.feature_importances_),\n",
+    "    key=lambda feature_importance: float(feature_importance[1]),\n",
+    "    reverse=True,\n",
+    ")[:5]:\n",
+    "    print(f\"{feature} - Importance: {importance}\")\n",
+    "print(\"---\")\n",
+    "\n",
     "# Predict the classes on the test dataset\n",
     "y_test_pred = khc.predict(X_test)\n",
     "print(\"Predicted classes (first 10):\")\n",
diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py
@@ -65,6 +65,15 @@ def khiops_classifier():
         print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
     print("---")
 
+    print("Top 5 used features, among those present in the dataset")
+    for feature, importance in sorted(
+        zip(khc.feature_names_in_, khc.feature_importances_),
+        key=lambda feature_importance: float(feature_importance[1]),
+        reverse=True,
+    )[:5]:
+        print(f"{feature} - Importance: {importance}")
+    print("---")
+
     # Predict the classes on the test dataset
     y_test_pred = khc.predict(X_test)
     print("Predicted classes (first 10):")
diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py
@@ -427,6 +427,7 @@ def _fit(self, ds, computation_dir, **kwargs):
             and hasattr(self, "model_report_")
             and isinstance(self.model_report_, kh.KhiopsJSONObject)
         ):
+            self.feature_names_in_ = ds.main_table.column_ids
             self._fit_training_post_process(ds)
             self.is_multitable_model_ = ds.is_multitable
             self.n_features_in_ = ds.main_table.n_features()
@@ -1597,6 +1598,36 @@ def __init__(
         self.n_evaluated_features = n_evaluated_features
         self.n_selected_features = n_selected_features
 
+    def _fit_training_post_process(self, ds):
+        """Stores the used feature statistics and the feature importances"""
+        # Call the parent's method
+        super()._fit_training_post_process(ds)
+
+        # Extract statistics, about the selected features, from the modeling report
+        modeling_report = self.model_report_.modeling_report.get_snb_predictor()
+        if modeling_report.selected_variables is not None:
+            self.feature_used_names_, self.feature_used_importances_ = (
+                self.get_feature_used_statistics(modeling_report)
+            )
+            self.n_features_used_ = len(self.feature_used_names_)
+
+        # Compute the importances in feature_names_in_ order (0.0 for unused features)
+        feature_importances = []
+
+        # feature_used_names_ is not set if no variable is selected in the model
+        feature_used_names = getattr(self, "feature_used_names_", [])
+        for feature_name in self.feature_names_in_:
+            if feature_name in feature_used_names:
+                feature_index = np.where(feature_used_names == feature_name)
+                feature_importance = self.feature_used_importances_[
+                    feature_index
+                ].ravel()[2]
+            else:
+                feature_importance = 0.0
+            feature_importances.append(feature_importance)
+        # Set once, after the loop: the attribute must exist even without features
+        self.feature_importances_ = np.array(feature_importances)
+
     def __sklearn_tags__(self):
         # If we don't implement this trivial method it's not found by the sklearn. This
         # is likely due to the complex resolution of the multiple inheritance.
@@ -1795,6 +1826,30 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
     classes_ : `ndarray <numpy.ndarray>` of shape (n_classes\_,)
         The list of classes seen in training. Depending on the training target, the
         contents are ``int`` or ``str``.
+    n_features_in_ : int
+        The number of features in the main table of the training dataset.
+    feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
+        Names of the features in the main table of the training dataset.
+    feature_importances_ :  `ndarray <numpy.ndarray>` of shape (n_features_in\_, )
+        Importances of the features provided to the classifier. The importance of each feature is calculated as follows:
+
+        - if the feature is used by the classifier, then its importance is equal
+          to the average of its exact Shapley values across the training data.
+
+        - if the feature is not used by the classifier, then its importance
+          equals 0.0.
+
+        .. note::
+             In order to maximize the accuracy of the feature importances, the
+             estimator must be trained on monotable data, must not use trees,
+             feature pairs, text features or timestamps. More precisely:
+
+             - the training dataset must be monotable;
+             - no timestamp column should be used in the training dataset;
+             - the `n_trees` parameter must be set to 0;
+             - the `n_pairs` parameter must be left at its default value, 0;
+             - the `n_text_features` parameter must be set to 0.
+
     n_features_evaluated_ : int
         The number of features evaluated by the classifier.
     feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -1817,7 +1872,8 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
           to all features selected by the classifier. It ranges between 0 (little
           contribution to the model) and 1 (large contribution to the model).
 
-        - Importance: The geometric mean between the Level and the Weight.
+        - Importance: Average of the exact Shapley values of each used feature
+          across the training data.
 
     is_multitable_model_ : bool
         ``True`` if the model was fitted on a multi-table dataset.
@@ -2029,16 +2085,6 @@ def _fit_training_post_process(self, ds):
                 if key.startswith("TargetProb"):
                     variable.used = True
 
-        # Extract statistics, about the selected features, from the modeling report
-        modeling_report = self.model_report_.modeling_report.get_snb_predictor()
-        if modeling_report.selected_variables is not None:
-            feature_used_names_, feature_used_importances_ = (
-                self.get_feature_used_statistics(modeling_report)
-            )
-            self.feature_used_names_ = feature_used_names_
-            self.feature_used_importances_ = feature_used_importances_
-            self.n_features_used_ = len(self.feature_used_names_)
-
     def predict(self, X):
         """Predicts the most probable class for the test dataset X
 
@@ -2208,6 +2254,30 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
 
     Attributes
     ----------
+    n_features_in_ : int
+        The number of features in the main table of the training dataset.
+    feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
+        Names of the features in the main table of the training dataset.
+    feature_importances_ :  `ndarray <numpy.ndarray>` of shape (n_features_in\_, )
+        Importances of the features provided to the regressor. The importance of each feature is calculated as follows:
+
+        - if the feature is used by the regressor, then its importance is equal
+          to the average of its exact Shapley values across the training data.
+
+        - if the feature is not used by the regressor, then its importance
+          equals 0.0.
+
+        .. note::
+             In order to maximize the accuracy of the feature importances, the
+             estimator must be trained on monotable data, must not use trees,
+             feature pairs, text features or timestamps. More precisely:
+
+             - the training dataset must be monotable;
+             - no timestamp column should be used in the training dataset;
+             - the `n_trees` parameter must be set to 0;
+             - the `n_pairs` parameter must be left at its default value, 0;
+             - the `n_text_features` parameter must be set to 0.
+
     n_features_evaluated_ : int
         The number of features evaluated by the classifier.
     feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -2230,7 +2300,8 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
           to all features selected by the classifier. It ranges between 0 (little
           contribution to the model) and 1 (large contribution to the model).
 
-        - Importance: The geometric mean between the Level and the Weight.
+        - Importance: Average of the exact Shapley values of each used feature
+          across the training data.
 
     is_multitable_model_ : bool
         ``True`` if the model was fitted on a multi-table dataset.
@@ -2335,16 +2406,6 @@ def _fit_training_post_process(self, ds):
         for variable_name in variables_to_eliminate:
             self._get_main_dictionary().remove_variable(variable_name)
 
-        # Extract statistics, about the selected features, from the modeling report
-        modeling_report = self.model_report_.modeling_report.get_snb_predictor()
-        if modeling_report.selected_variables is not None:
-            feature_used_names_, feature_used_importances_ = (
-                self.get_feature_used_statistics(modeling_report)
-            )
-            self.feature_used_names_ = feature_used_names_
-            self.feature_used_importances_ = feature_used_importances_
-            self.n_features_used_ = len(self.feature_used_names_)
-
     def _check_target_type(self, ds):
         _check_numerical_target_type(ds)
 
diff --git a/tests/test_estimator_attributes.py b/tests/test_estimator_attributes.py
@@ -13,6 +13,7 @@
 import pandas as pd
 
 from khiops import core as kh
+from khiops.sklearn.dataset import Dataset
 from khiops.sklearn.estimators import KhiopsClassifier, KhiopsEncoder, KhiopsRegressor
 
 # Disable PEP8 variable names because of scikit-learn X,y conventions
@@ -74,6 +75,7 @@ def assert_attribute_values_ok(self, model, X, y):
             self.assertEqual(model.classes_.tolist(), sorted(y.unique()))
             self.assertEqual(model.n_classes_, len(y.unique()))
             self.assertEqual(model.n_features_in_, len(X.columns))
+            self.assertEqual(model.feature_names_in_.tolist(), X.columns.tolist())
 
         # Extract the features and their levels from the report
         # TODO: Eliminate this as this is the implementation
@@ -162,6 +164,25 @@ def assert_attribute_values_ok(self, model, X, y):
                 model.n_features_used_, len(feature_used_importances_report)
             )
 
+            ds = Dataset(X)
+            feature_names_in_dataset = ds.main_table.column_ids
+            self.assertEqual(
+                model.feature_names_in_.tolist(), feature_names_in_dataset.tolist()
+            )
+
+            feature_importances_report = []
+            for feature_name in feature_names_in_dataset:
+                if feature_name in feature_used_names:
+                    feature_index = feature_used_names.index(feature_name)
+                    feature_importances_report.append(
+                        feature_used_importances_report[feature_index][2]
+                    )
+                else:
+                    feature_importances_report.append(0.0)
+            self.assertEqual(
+                model.feature_importances_.tolist(), feature_importances_report
+            )
+
     def test_classifier_attributes_monotable(self):
         """Test consistency of KhiopsClassifier's attributes with the output reports