@@ -427,6 +427,7 @@ def _fit(self, ds, computation_dir, **kwargs):
427427 and hasattr (self , "model_report_" )
428428 and isinstance (self .model_report_ , kh .KhiopsJSONObject )
429429 ):
430+ self .feature_names_in_ = ds .main_table .column_ids
430431 self ._fit_training_post_process (ds )
431432 self .is_multitable_model_ = ds .is_multitable
432433 self .n_features_in_ = ds .main_table .n_features ()
@@ -1597,6 +1598,36 @@ def __init__(
15971598 self .n_evaluated_features = n_evaluated_features
15981599 self .n_selected_features = n_selected_features
15991600
def _fit_training_post_process(self, ds):
    """Post-process the fit and derive the per-feature importance attributes

    After delegating to the parent's post-processing, this method reads the
    selective naive Bayes predictor report from ``self.model_report_`` and
    sets:

    - ``feature_used_names_``, ``feature_used_importances_`` and
      ``n_features_used_``: only when the report has selected variables.
    - ``feature_importances_``: always; one value per entry of
      ``self.feature_names_in_``, in the same order. A feature that is not
      among the used features gets an importance of 0.0.

    Parameters
    ----------
    ds
        The training dataset; it is only forwarded to the parent's method.
    """
    # Call the parent's method
    super()._fit_training_post_process(ds)

    # Extract statistics, about the selected features, from the modeling report
    modeling_report = self.model_report_.modeling_report.get_snb_predictor()
    if modeling_report.selected_variables is not None:
        (
            self.feature_used_names_,
            self.feature_used_importances_,
        ) = self.get_feature_used_statistics(modeling_report)
        self.n_features_used_ = len(self.feature_used_names_)

    # The "used" attributes are not set if no variable is selected in the model
    used_names = getattr(self, "feature_used_names_", [])
    used_statistics = getattr(self, "feature_used_importances_", [])

    # Position of the importance value within each row of the used-feature
    # statistics.
    # NOTE(review): rows are presumably (Level, Weight, Importance) -- confirm
    # against `get_feature_used_statistics`.
    importance_column = 2

    # Build the name -> importance lookup once instead of scanning the used
    # names for every input feature. `setdefault` keeps the first occurrence
    # of a name, which is the entry the former `np.where(...)[...].ravel()`
    # lookup resolved to.
    importance_by_name = {}
    for used_name, statistics_row in zip(used_names, used_statistics):
        importance_by_name.setdefault(used_name, statistics_row[importance_column])

    # Compute feature importances: features not used by the model get 0.0
    self.feature_importances_ = np.array(
        [
            importance_by_name.get(feature_name, 0.0)
            for feature_name in self.feature_names_in_
        ]
    )
1630+
16001631 def __sklearn_tags__ (self ):
16011632 # If we don't implement this trivial method it's not found by the sklearn. This
16021633 # is likely due to the complex resolution of the multiple inheritance.
@@ -1795,6 +1826,39 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
17951826 classes_ : `ndarray <numpy.ndarray>` of shape (n_classes\_,)
17961827 The list of classes seen in training. Depending on the training target, the
17971828 contents are ``int`` or ``str``.
1829+ n_features_in_ : int
1830+ The number of features in the main table of the training dataset.
1831+ feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
1832+ Names of the features in the main table of the training dataset.
1833+ feature_importances_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_, )
1834+ Importances of the features provided to the classifier, in the main
1835+ table of the training dataset. The importance of each feature is
1836+ calculated as follows:
1837+
1838+ - if the feature is used by the classifier, then its importance is the
1839+ average of its exact Shapley values across the training dataset.
1840+
1841+ - if the feature is not used by the classifier, then its importance
1842+ is 0.0.
1843+
1844+ .. warning::
1845+ Since Khiops is an AutoML suite, it uses generated features on its
1846+ predictors (e.g. regularized decision trees). This implies that there
1847+ is no direct link between the native features and their importance
1848+ when AutoML features are used, as an important feature might not be
1849+ selected, but a generated feature might (e.g. a tree containing an
1850+ important variable).
1851+
1852+ To ensure that the ``feature_importances_`` attribute has
1853+ `the meaning specified by scikit-learn <https://scikit-learn.org/stable/glossary.html#term-feature_importances_>`_
1854+ one must disable most AutoML capabilities of Khiops, namely:
1855+
1856+ - the training dataset must be monotable;
1857+ - no timestamp column should be used in the training dataset;
1858+ - the ``n_trees`` parameter must be set to 0;
1859+ - the ``n_pairs`` parameter must be left to its default value, 0;
1860+ - the ``n_text_features`` parameter must be set to 0.
1861+
17981862 n_features_evaluated_ : int
17991863 The number of features evaluated by the classifier.
18001864 feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -1817,7 +1881,8 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
18171881 to all features selected by the classifier. It ranges between 0 (little
18181882 contribution to the model) and 1 (large contribution to the model).
18191883
1820- - Importance: The geometric mean between the Level and the Weight.
1884+ - Importance: Average of the exact Shapley values of each used feature
1885+ across the training data.
18211886
18221887 is_multitable_model_ : bool
18231888 ``True`` if the model was fitted on a multi-table dataset.
@@ -2029,16 +2094,6 @@ def _fit_training_post_process(self, ds):
20292094 if key .startswith ("TargetProb" ):
20302095 variable .used = True
20312096
2032- # Extract statistics, about the selected features, from the modeling report
2033- modeling_report = self .model_report_ .modeling_report .get_snb_predictor ()
2034- if modeling_report .selected_variables is not None :
2035- feature_used_names_ , feature_used_importances_ = (
2036- self .get_feature_used_statistics (modeling_report )
2037- )
2038- self .feature_used_names_ = feature_used_names_
2039- self .feature_used_importances_ = feature_used_importances_
2040- self .n_features_used_ = len (self .feature_used_names_ )
2041-
20422097 def predict (self , X ):
20432098 """Predicts the most probable class for the test dataset X
20442099
@@ -2208,6 +2263,37 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
22082263
22092264 Attributes
22102265 ----------
2266+ n_features_in_ : int
2267+ The number of features in the main table of the training dataset.
2268+ feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
2269+ Names of the features in the main table of the training dataset.
2270+ feature_importances_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_, )
2271+ Importances of the features provided to the regressor. The importance of each feature is calculated as follows:
2272+
2273+ - if the feature is used by the regressor, then its importance is the
2274+ average of its exact Shapley values across the training dataset.
2275+
2276+ - if the feature is not used by the regressor, then its importance
2277+ is 0.0.
2278+
2279+ .. warning::
2280+ Since Khiops is an AutoML suite, it uses generated features on its
2281+ predictors (e.g. regularized decision trees). This implies that there
2282+ is no direct link between the native features and their importance
2283+ when AutoML features are used, as an important feature might not be
2284+ selected, but a generated feature might (e.g. a tree containing an
2285+ important variable).
2286+
2287+ To ensure that the ``feature_importances_`` attribute has
2288+ `the meaning specified by scikit-learn <https://scikit-learn.org/stable/glossary.html#term-feature_importances_>`_
2289+ one must disable most AutoML capabilities of Khiops, namely:
2290+
2291+ - the training dataset must be monotable;
2292+ - no timestamp column should be used in the training dataset;
2293+ - the ``n_trees`` parameter must be set to 0;
2294+ - the ``n_pairs`` parameter must be left to its default value, 0;
2295+ - the ``n_text_features`` parameter must be set to 0.
2296+
22112297 n_features_evaluated_ : int
22122298 The number of features evaluated by the regressor.
22132299 feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -2230,7 +2316,8 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
22302316 to all features selected by the regressor. It ranges between 0 (little
22312317 contribution to the model) and 1 (large contribution to the model).
22322318
2233- - Importance: The geometric mean between the Level and the Weight.
2319+ - Importance: Average of the exact Shapley values of each used feature
2320+ across the training data.
22342321
22352322 is_multitable_model_ : bool
22362323 ``True`` if the model was fitted on a multi-table dataset.
@@ -2335,16 +2422,6 @@ def _fit_training_post_process(self, ds):
23352422 for variable_name in variables_to_eliminate :
23362423 self ._get_main_dictionary ().remove_variable (variable_name )
23372424
2338- # Extract statistics, about the selected features, from the modeling report
2339- modeling_report = self .model_report_ .modeling_report .get_snb_predictor ()
2340- if modeling_report .selected_variables is not None :
2341- feature_used_names_ , feature_used_importances_ = (
2342- self .get_feature_used_statistics (modeling_report )
2343- )
2344- self .feature_used_names_ = feature_used_names_
2345- self .feature_used_importances_ = feature_used_importances_
2346- self .n_features_used_ = len (self .feature_used_names_ )
2347-
23482425 def _check_target_type (self , ds ):
23492426 _check_numerical_target_type (ds )
23502427
0 commit comments