@@ -427,6 +427,7 @@ def _fit(self, ds, computation_dir, **kwargs):
427427 and hasattr (self , "model_report_" )
428428 and isinstance (self .model_report_ , kh .KhiopsJSONObject )
429429 ):
430+ self .feature_names_in_ = ds .main_table .column_ids
430431 self ._fit_training_post_process (ds )
431432 self .is_multitable_model_ = ds .is_multitable
432433 self .n_features_in_ = ds .main_table .n_features ()
@@ -1597,6 +1598,36 @@ def __init__(
15971598 self .n_evaluated_features = n_evaluated_features
15981599 self .n_selected_features = n_selected_features
15991600
def _fit_training_post_process(self, ds):
    """Publish the used-feature statistics and the per-input-feature importances

    After delegating to the parent class, this method reads the SNB predictor
    section of ``self.model_report_`` and sets:

    - ``feature_used_names_``, ``feature_used_importances_`` and
      ``n_features_used_``: only when the report lists selected variables;
    - ``feature_importances_``: always, as an array aligned with
      ``self.feature_names_in_``. A feature that is not among the selected
      variables gets an importance of 0.0.

    Parameters
    ----------
    ds :
        The training dataset. It is only forwarded to the parent's
        post-processing.
    """
    # Call the parent's method
    super()._fit_training_post_process(ds)

    # Extract statistics, about the selected features, from the modeling report
    modeling_report = self.model_report_.modeling_report.get_snb_predictor()

    # Maps each used feature name to its importance. It is built from the
    # current report only, so values left on the instance by an earlier fit can
    # never leak into ``feature_importances_`` when no variable is selected.
    importance_by_name = {}
    if modeling_report.selected_variables is not None:
        used_names, used_statistics = self.get_feature_used_statistics(
            modeling_report
        )
        self.feature_used_names_ = used_names
        self.feature_used_importances_ = used_statistics
        self.n_features_used_ = len(used_names)

        # Third column of each statistics row; presumably the "Importance"
        # statistic (after Level and Weight) -- TODO confirm the column order
        # against get_feature_used_statistics.
        importance_column = 2
        for used_name, statistics_row in zip(used_names, used_statistics):
            # setdefault keeps the first occurrence of a name, like the
            # original "first np.where match" lookup did
            importance_by_name.setdefault(
                used_name, statistics_row[importance_column]
            )

    # Compute feature importances: one dictionary lookup per input feature
    # instead of a full scan of the used names for each of them
    self.feature_importances_ = np.array(
        [
            importance_by_name.get(feature_name, 0.0)
            for feature_name in self.feature_names_in_
        ]
    )
1630+
16001631 def __sklearn_tags__ (self ):
16011632 # If we don't implement this trivial method it's not found by the sklearn. This
16021633 # is likely due to the complex resolution of the multiple inheritance.
@@ -1795,6 +1826,30 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
17951826 classes_ : `ndarray <numpy.ndarray>` of shape (n_classes\_,)
17961827 The list of classes seen in training. Depending on the training target, the
17971828 contents are ``int`` or ``str``.
1829+ n_features_in_ : int
1830+ The number of features in the main table of the training dataset.
1831+ feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
1832+ Names of the features in the main table of the training dataset.
1833+ feature_importances_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
1834+ Importances of the features provided to the classifier. The importance of each feature is calculated as follows:
1835+
1836+ - if the feature is used by the classifier, then its importance is equal
1837+ to the average of its exact Shapley values across the training data.
1838+
1839+ - if the feature is not used by the classifier, then its importance
1840+ equals 0.0.
1841+
1842+ .. note::
1843+ In order to maximize the accuracy of the feature importances, the
1844+ estimator must be trained on monotable data, must not use trees,
1845+ feature pairs, text features or timestamps. More precisely:
1846+
1847+ - the training dataset must be monotable;
1848+ - no timestamp column should be used in the training dataset;
1849+ - the `n_trees` parameter must be set to 0;
1850+ - the `n_pairs` parameter must be left to its default value, 0;
1851+ - the `n_text_features` parameter must be set to 0.
1852+
17981853 n_features_evaluated_ : int
17991854 The number of features evaluated by the classifier.
18001855 feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -1817,7 +1872,8 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
18171872 to all features selected by the classifier. It ranges between 0 (little
18181873 contribution to the model) and 1 (large contribution to the model).
18191874
1820- - Importance: The geometric mean between the Level and the Weight.
1875+ - Importance: Average of the exact Shapley values of each used feature
1876+ across the training data.
18211877
18221878 is_multitable_model_ : bool
18231879 ``True`` if the model was fitted on a multi-table dataset.
@@ -2029,16 +2085,6 @@ def _fit_training_post_process(self, ds):
20292085 if key .startswith ("TargetProb" ):
20302086 variable .used = True
20312087
2032- # Extract statistics, about the selected features, from the modeling report
2033- modeling_report = self .model_report_ .modeling_report .get_snb_predictor ()
2034- if modeling_report .selected_variables is not None :
2035- feature_used_names_ , feature_used_importances_ = (
2036- self .get_feature_used_statistics (modeling_report )
2037- )
2038- self .feature_used_names_ = feature_used_names_
2039- self .feature_used_importances_ = feature_used_importances_
2040- self .n_features_used_ = len (self .feature_used_names_ )
2041-
20422088 def predict (self , X ):
20432089 """Predicts the most probable class for the test dataset X
20442090
@@ -2208,6 +2254,30 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
22082254
22092255 Attributes
22102256 ----------
2257+ n_features_in_ : int
2258+ The number of features in the main table of the training dataset.
2259+ feature_names_in_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
2260+ Names of the features in the main table of the training dataset.
2261+ feature_importances_ : `ndarray <numpy.ndarray>` of shape (n_features_in\_,)
2262+ Importances of the features provided to the regressor. The importance of each feature is calculated as follows:
2263+
2264+ - if the feature is used by the regressor, then its importance is equal
2265+ to the average of its exact Shapley values across the training data.
2266+
2267+ - if the feature is not used by the regressor, then its importance
2268+ equals 0.0.
2269+
2270+ .. note::
2271+ In order to maximize the accuracy of the feature importances, the
2272+ estimator must be trained on monotable data, must not use trees,
2273+ feature pairs, text features or timestamps. More precisely:
2274+
2275+ - the training dataset must be monotable;
2276+ - no timestamp column should be used in the training dataset;
2277+ - the `n_trees` parameter must be set to 0;
2278+ - the `n_pairs` parameter must be left to its default value, 0;
2279+ - the `n_text_features` parameter must be set to 0.
2280+
22112281 n_features_evaluated_ : int
22122282 The number of features evaluated by the regressor.
22132283 feature_evaluated_names_ : `ndarray <numpy.ndarray>` of shape (n_features_evaluated\_,)
@@ -2230,7 +2300,8 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor):
22302300 to all features selected by the regressor. It ranges between 0 (little
22312301 contribution to the model) and 1 (large contribution to the model).
22322302
2233- - Importance: The geometric mean between the Level and the Weight.
2303+ - Importance: Average of the exact Shapley values of each used feature
2304+ across the training data.
22342305
22352306 is_multitable_model_ : bool
22362307 ``True`` if the model was fitted on a multi-table dataset.
@@ -2335,16 +2406,6 @@ def _fit_training_post_process(self, ds):
23352406 for variable_name in variables_to_eliminate :
23362407 self ._get_main_dictionary ().remove_variable (variable_name )
23372408
2338- # Extract statistics, about the selected features, from the modeling report
2339- modeling_report = self .model_report_ .modeling_report .get_snb_predictor ()
2340- if modeling_report .selected_variables is not None :
2341- feature_used_names_ , feature_used_importances_ = (
2342- self .get_feature_used_statistics (modeling_report )
2343- )
2344- self .feature_used_names_ = feature_used_names_
2345- self .feature_used_importances_ = feature_used_importances_
2346- self .n_features_used_ = len (self .feature_used_names_ )
2347-
23482409 def _check_target_type (self , ds ):
23492410 _check_numerical_target_type (ds )
23502411
0 commit comments