From ffbd76ec9569b01a4ebdc20f74ff3755b4051306 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Thu, 2 Oct 2025 18:18:31 +0200 Subject: [PATCH 1/9] Silence Pandas deprecation warning --- khiops/sklearn/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/khiops/sklearn/dataset.py b/khiops/sklearn/dataset.py index 03603917..989f45a7 100644 --- a/khiops/sklearn/dataset.py +++ b/khiops/sklearn/dataset.py @@ -614,9 +614,9 @@ def _init_target_column(self, y): else: y_checked = _column_or_1d_with_dtype(y, dtype=y.dtype) elif hasattr(y, "dtypes"): - if isinstance(y.dtypes[0], pd.CategoricalDtype): + if isinstance(y.dtypes.iloc[0], pd.CategoricalDtype): y_checked = _column_or_1d_with_dtype( - y, dtype=y.dtypes[0].categories.dtype + y, dtype=y.dtypes.iloc[0].categories.dtype ) else: y_checked = _column_or_1d_with_dtype(y) From b52d4e8da377e4ab8d340909f555ade1e2cd1e72 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Wed, 1 Oct 2025 19:08:11 +0200 Subject: [PATCH 2/9] Add sklearn estimators feature_importances_ attribute This attribute quantifies the importance of each of the _input_ features, in their order of occurrence in the input dataset: - if the feature is used, then its importance is retrieved from the report (as the average of its exact Shapley values across the training dataset) - else, the importance is set to 0.0. 
--- doc/samples/samples_sklearn.rst | 9 ++ khiops/samples/samples_sklearn.ipynb | 9 ++ khiops/samples/samples_sklearn.py | 9 ++ khiops/sklearn/estimators.py | 121 ++++++++++++++++++++++----- tests/test_estimator_attributes.py | 21 +++++ 5 files changed, 147 insertions(+), 22 deletions(-) diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index ce8611d9..24be773f 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -76,6 +76,15 @@ Samples print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") print("---") + print("Top 5 used features, among those present in the dataset") + for feature, importance in sorted( + zip(khc.feature_names_in_, khc.feature_importances_), + key=lambda imp: float(imp[1]), + reverse=True, + )[:5]: + print(f"{feature} - Importance: {importance}") + print("---") + # Predict the classes on the test dataset y_test_pred = khc.predict(X_test) print("Predicted classes (first 10):") diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 6fa1fce2..ce66e18e 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -62,6 +62,15 @@ " print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n", "print(\"---\")\n", "\n", + "print(\"Top 5 used features, among those present in the dataset\")\n", + "for feature, importance in sorted(\n", + " zip(khc.feature_names_in_, khc.feature_importances_),\n", + " key=lambda imp: float(imp[1]),\n", + " reverse=True,\n", + ")[:5]:\n", + " print(f\"{feature} - Importance: {importance}\")\n", + "print(\"---\")\n", + "\n", "# Predict the classes on the test dataset\n", "y_test_pred = khc.predict(X_test)\n", "print(\"Predicted classes (first 10):\")\n", diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index 3ba1d320..272701c5 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -65,6 
+65,15 @@ def khiops_classifier(): print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") print("---") + print("Top 5 used features, among those present in the dataset") + for feature, importance in sorted( + zip(khc.feature_names_in_, khc.feature_importances_), + key=lambda imp: float(imp[1]), + reverse=True, + )[:5]: + print(f"{feature} - Importance: {importance}") + print("---") + # Predict the classes on the test dataset y_test_pred = khc.predict(X_test) print("Predicted classes (first 10):") diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 3cfcf13a..422ccd2d 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -427,6 +427,7 @@ def _fit(self, ds, computation_dir, **kwargs): and hasattr(self, "model_report_") and isinstance(self.model_report_, kh.KhiopsJSONObject) ): + self.feature_names_in_ = ds.main_table.column_ids self._fit_training_post_process(ds) self.is_multitable_model_ = ds.is_multitable self.n_features_in_ = ds.main_table.n_features() @@ -1597,6 +1598,36 @@ def __init__( self.n_evaluated_features = n_evaluated_features self.n_selected_features = n_selected_features + def _fit_training_post_process(self, ds): + # Call the parent's method + super()._fit_training_post_process(ds) + + # Extract statistics, about the selected features, from the modeling report + modeling_report = self.model_report_.modeling_report.get_snb_predictor() + if modeling_report.selected_variables is not None: + feature_used_names_, feature_used_importances_ = ( + self.get_feature_used_statistics(modeling_report) + ) + self.feature_used_names_ = feature_used_names_ + self.feature_used_importances_ = feature_used_importances_ + self.n_features_used_ = len(self.feature_used_names_) + + # feature_used_names_ is not set if no variable is selected in the model + feature_used_names = getattr(self, "feature_used_names_", []) + + # Compute feature importances + feature_importances = [] + for feature_name in 
self.feature_names_in_: + if feature_name in feature_used_names: + feature_index = np.where(feature_used_names == feature_name) + feature_importance = self.feature_used_importances_[ + feature_index + ].ravel()[2] + else: + feature_importance = 0.0 + feature_importances.append(feature_importance) + self.feature_importances_ = np.array(feature_importances) + def __sklearn_tags__(self): # If we don't implement this trivial method it's not found by the sklearn. This # is likely due to the complex resolution of the multiple inheritance. @@ -1795,6 +1826,39 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor): classes_ : `ndarray ` of shape (n_classes\_,) The list of classes seen in training. Depending on the training target, the contents are ``int`` or ``str``. + n_features_in_ : int + The number of features in the main table of the training dataset. + feature_names_in_ : `ndarray ` of shape (n_features_in\_,) + Names of the features in the main table of the training dataset. + feature_importances_ : `ndarray ` of shape (n_features_in\_, ) + Importances of the features provided to the classifier, in the main + table of the training dataset. The importance of each feature is + calculated as follows: + + - if the feature is used by the classifier, then its importance is the + average of its exact Shapley values across the training dataset. + + - if the feature is not used by the classifier, then its importance + is 0.0. + + .. warning:: + Since Khiops is an AutoML suite, it uses generated features on its + predictors (e.g. regularized decision trees). This implies that there + is no direct link between the native features and its importance + when AutoML features are used, as an important feature might not be + selected, but a generated feature might (e.g. a tree containing an + important variable). 
+ + To ensure that the ``feature_importances_`` attribute has + `the meaning specified by scikit-learn `_ + one must disable most AutoML capabilities of Khiops, namely: + + - the training dataset must be monotable; + - no timestamp column should be used in the training dataset; + - the ``n_trees`` parameter must be set to 0; + - the ``n_pairs`` parameter must be left to its default value, 0; + - the ``n_text_features`` parameter must be set to 0. + n_features_evaluated_ : int The number of features evaluated by the classifier. feature_evaluated_names_ : `ndarray ` of shape (n_features_evaluated\_,) @@ -1817,7 +1881,8 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor): to all features selected by the classifier. It ranges between 0 (little contribution to the model) and 1 (large contribution to the model). - - Importance: The geometric mean between the Level and the Weight. + - Importance: Average of the exact Shapley values of each used feature + across the training data. is_multitable_model_ : bool ``True`` if the model was fitted on a multi-table dataset. @@ -2029,16 +2094,6 @@ def _fit_training_post_process(self, ds): if key.startswith("TargetProb"): variable.used = True - # Extract statistics, about the selected features, from the modeling report - modeling_report = self.model_report_.modeling_report.get_snb_predictor() - if modeling_report.selected_variables is not None: - feature_used_names_, feature_used_importances_ = ( - self.get_feature_used_statistics(modeling_report) - ) - self.feature_used_names_ = feature_used_names_ - self.feature_used_importances_ = feature_used_importances_ - self.n_features_used_ = len(self.feature_used_names_) - def predict(self, X): """Predicts the most probable class for the test dataset X @@ -2208,6 +2263,37 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor): Attributes ---------- + n_features_in_ : int + The number of features in the main table of the training dataset. 
+ feature_names_in_ : `ndarray ` of shape (n_features_in\_,) + Names of the features in the main table of the training dataset. + feature_importances_ : `ndarray ` of shape (n_features_in\_, ) + Importances of the features provided to the classifier. The importance of each feature is calculated as follows: + + - if the feature is used by the classifier, then its importance is the + average of its exact Shapley values across the training dataset. + + - if the feature is not used by the classifier, then its importance + is 0.0. + + .. warning:: + Since Khiops is an AutoML suite, it uses generated features on its + predictors (e.g. regularized decision trees). This implies that there + is no direct link between the native features and its importance + when AutoML features are used, as an important feature might not be + selected, but a generated feature might (e.g. a tree containing an + important variable). + + To ensure that the ``feature_importances_`` attribute has + `the meaning specified by scikit-learn `_ + one must disable most AutoML capabilities of Khiops, namely: + + - the training dataset must be monotable; + - no timestamp column should be used in the training dataset; + - the ``n_trees`` parameter must be set to 0; + - the ``n_pairs`` parameter must be left to its default value, 0; + - the ``n_text_features`` parameter must be set to 0. + n_features_evaluated_ : int The number of features evaluated by the classifier. feature_evaluated_names_ : `ndarray ` of shape (n_features_evaluated\_,) @@ -2230,7 +2316,8 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor): to all features selected by the classifier. It ranges between 0 (little contribution to the model) and 1 (large contribution to the model). - - Importance: The geometric mean between the Level and the Weight. + - Importance: Average of the exact Shapley values of each used feature + across the training data. is_multitable_model_ : bool ``True`` if the model was fitted on a multi-table dataset. 
@@ -2335,16 +2422,6 @@ def _fit_training_post_process(self, ds): for variable_name in variables_to_eliminate: self._get_main_dictionary().remove_variable(variable_name) - # Extract statistics, about the selected features, from the modeling report - modeling_report = self.model_report_.modeling_report.get_snb_predictor() - if modeling_report.selected_variables is not None: - feature_used_names_, feature_used_importances_ = ( - self.get_feature_used_statistics(modeling_report) - ) - self.feature_used_names_ = feature_used_names_ - self.feature_used_importances_ = feature_used_importances_ - self.n_features_used_ = len(self.feature_used_names_) - def _check_target_type(self, ds): _check_numerical_target_type(ds) diff --git a/tests/test_estimator_attributes.py b/tests/test_estimator_attributes.py index ac4a46c6..73434e45 100644 --- a/tests/test_estimator_attributes.py +++ b/tests/test_estimator_attributes.py @@ -13,6 +13,7 @@ import pandas as pd from khiops import core as kh +from khiops.sklearn.dataset import Dataset from khiops.sklearn.estimators import KhiopsClassifier, KhiopsEncoder, KhiopsRegressor # Disable PEP8 variable names because of scikit-learn X,y conventions @@ -74,6 +75,7 @@ def assert_attribute_values_ok(self, model, X, y): self.assertEqual(model.classes_.tolist(), sorted(y.unique())) self.assertEqual(model.n_classes_, len(y.unique())) self.assertEqual(model.n_features_in_, len(X.columns)) + self.assertEqual(model.feature_names_in_.tolist(), X.columns.tolist()) # Extract the features and their levels from the report # TODO: Eliminate this as this is the implementation @@ -162,6 +164,25 @@ def assert_attribute_values_ok(self, model, X, y): model.n_features_used_, len(feature_used_importances_report) ) + # Test input feature names and importances + ds = Dataset(X) + feature_names_in_dataset = ds.main_table.column_ids + self.assertEqual( + model.feature_names_in_.tolist(), feature_names_in_dataset.tolist() + ) + feature_importances_report = [] + for 
feature_name in feature_names_in_dataset: + if feature_name in feature_used_names: + feature_index = feature_used_names.index(feature_name) + feature_importances_report.append( + feature_used_importances_report[feature_index][2] + ) + else: + feature_importances_report.append(0.0) + self.assertEqual( + model.feature_importances_.tolist(), feature_importances_report + ) + def test_classifier_attributes_monotable(self): """Test consistency of KhiopsClassifier's attributes with the output reports From 4ef9b569b969a11f5c28d451350db67f5433729e Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Fri, 3 Oct 2025 14:08:52 +0200 Subject: [PATCH 3/9] Drop feature_evaluated_* and n_features_evaluated_ from sklearn estimators Indeed, these characterize the analysis process, not the resulting model itself. --- doc/samples/samples_sklearn.rst | 11 ---- khiops/samples/samples_sklearn.ipynb | 11 ---- khiops/samples/samples_sklearn.py | 11 ---- khiops/sklearn/estimators.py | 75 ---------------------------- tests/test_estimator_attributes.py | 62 ----------------------- 5 files changed, 170 deletions(-) diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index 24be773f..c6e364b6 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -69,7 +69,6 @@ Samples khc.fit(X_train, y_train) # Show the feature importance info - print(f"Features evaluated: {khc.n_features_evaluated_}") print(f"Features selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): @@ -196,7 +195,6 @@ Samples khc.fit(X_train, y_train) # Show the feature importance info - print(f"Features evaluated: {khc.n_features_evaluated_}") print(f"Features selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): @@ -317,7 +315,6 @@ Samples khc.fit(X_train, y_train) # Show the feature importance info - 
print(f"Features evaluated: {khc.n_features_evaluated_}") print(f"Features selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): @@ -549,7 +546,6 @@ Samples khr.fit(X_train, y_train) # Show the feature importance info - print(f"Features evaluated: {khr.n_features_evaluated_}") print(f"Features selected : {khr.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khr.feature_used_names_[:3]): @@ -676,13 +672,6 @@ Samples khe = KhiopsEncoder(n_features=10) khe.fit(X, y) - # Show the feature importance info - print(f"Features evaluated: {khe.n_features_evaluated_}") - print("Top 3 evaluated features") - for i, feature in enumerate(khe.feature_evaluated_names_[:3]): - print(f"{feature} - Level: {khe.feature_evaluated_importances_[i]}") - print("---") - # Transform the train dataset print("Encoded feature names:") print(khe.feature_names_out_) diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index ce66e18e..66cdac58 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -55,7 +55,6 @@ "khc.fit(X_train, y_train)\n", "\n", "# Show the feature importance info\n", - "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n", "print(f\"Features selected : {khc.n_features_used_}\")\n", "print(\"Top 3 used features\")\n", "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", @@ -208,7 +207,6 @@ "khc.fit(X_train, y_train)\n", "\n", "# Show the feature importance info\n", - "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n", "print(f\"Features selected : {khc.n_features_used_}\")\n", "print(\"Top 3 used features\")\n", "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", @@ -355,7 +353,6 @@ "khc.fit(X_train, y_train)\n", "\n", "# Show the feature importance info\n", - "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n", "print(f\"Features selected : 
{khc.n_features_used_}\")\n", "print(\"Top 3 used features\")\n", "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", @@ -639,7 +636,6 @@ "khr.fit(X_train, y_train)\n", "\n", "# Show the feature importance info\n", - "print(f\"Features evaluated: {khr.n_features_evaluated_}\")\n", "print(f\"Features selected : {khr.n_features_used_}\")\n", "print(\"Top 3 used features\")\n", "for i, feature in enumerate(khr.feature_used_names_[:3]):\n", @@ -805,13 +801,6 @@ "khe = KhiopsEncoder(n_features=10)\n", "khe.fit(X, y)\n", "\n", - "# Show the feature importance info\n", - "print(f\"Features evaluated: {khe.n_features_evaluated_}\")\n", - "print(\"Top 3 evaluated features\")\n", - "for i, feature in enumerate(khe.feature_evaluated_names_[:3]):\n", - " print(f\"{feature} - Level: {khe.feature_evaluated_importances_[i]}\")\n", - "print(\"---\")\n", - "\n", "# Transform the train dataset\n", "print(\"Encoded feature names:\")\n", "print(khe.feature_names_out_)\n", diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index 272701c5..e5dbdd67 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -58,7 +58,6 @@ def khiops_classifier(): khc.fit(X_train, y_train) # Show the feature importance info - print(f"Features evaluated: {khc.n_features_evaluated_}") print(f"Features selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): @@ -191,7 +190,6 @@ def khiops_classifier_text(): khc.fit(X_train, y_train) # Show the feature importance info - print(f"Features evaluated: {khc.n_features_evaluated_}") print(f"Features selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): @@ -320,7 +318,6 @@ def khiops_classifier_multitable_snowflake(): khc.fit(X_train, y_train) # Show the feature importance info - print(f"Features evaluated: {khc.n_features_evaluated_}") print(f"Features 
selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): @@ -560,7 +557,6 @@ def khiops_regressor(): khr.fit(X_train, y_train) # Show the feature importance info - print(f"Features evaluated: {khr.n_features_evaluated_}") print(f"Features selected : {khr.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khr.feature_used_names_[:3]): @@ -705,13 +701,6 @@ def khiops_encoder_multitable_snowflake(): khe = KhiopsEncoder(n_features=10) khe.fit(X, y) - # Show the feature importance info - print(f"Features evaluated: {khe.n_features_evaluated_}") - print("Top 3 evaluated features") - for i, feature in enumerate(khe.feature_evaluated_names_[:3]): - print(f"{feature} - Level: {khe.feature_evaluated_importances_[i]}") - print("---") - # Transform the train dataset print("Encoded feature names:") print(khe.feature_names_out_) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 422ccd2d..160004e6 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -1449,59 +1449,6 @@ def _fit_training_post_process(self, ds): if self.model_main_dictionary_name_ is None: raise ValueError("No model dictionary after Khiops call") - # Extract, from the preparation reports, the number of evaluated features, - # their names and their levels - univariate_preparation_report = self.model_report_.preparation_report - if self.model_report_.bivariate_preparation_report is not None: - bivariate_preparation_report = ( - self.model_report_.bivariate_preparation_report - ) - pair_feature_evaluated_names_ = ( - bivariate_preparation_report.get_variable_pair_names() - ) - pair_feature_evaluated_levels_ = [ - bivariate_preparation_report.get_variable_pair_statistics(*var).level - for var in bivariate_preparation_report.get_variable_pair_names() - ] - else: - pair_feature_evaluated_names_ = [] - pair_feature_evaluated_levels_ = [] - if 
self.model_report_.tree_preparation_report is not None: - tree_preparation_report = self.model_report_.tree_preparation_report - tree_feature_evaluated_names_ = tree_preparation_report.get_variable_names() - tree_feature_evaluated_levels_ = [ - tree_preparation_report.get_variable_statistics(var).level - for var in tree_preparation_report.get_variable_names() - ] - else: - tree_feature_evaluated_names_ = [] - tree_feature_evaluated_levels_ = [] - - feature_evaluated_names_ = ( - univariate_preparation_report.get_variable_names() - + pair_feature_evaluated_names_ - + tree_feature_evaluated_names_ - ) - feature_evaluated_importances_ = np.array( - [ - univariate_preparation_report.get_variable_statistics(var).level - for var in univariate_preparation_report.get_variable_names() - ] - + pair_feature_evaluated_levels_ - + tree_feature_evaluated_levels_ - ) - - # Sort the features by level - combined = list(zip(feature_evaluated_names_, feature_evaluated_importances_)) - combined.sort(key=lambda x: x[1], reverse=True) - - # Set the sklearn attributes - self.feature_evaluated_names_ = np.array( - [x[0] for x in combined], dtype=np.dtype("object") - ) - self.feature_evaluated_importances_ = np.array([x[1] for x in combined]) - self.n_features_evaluated_ = len(combined) - def _transform_check_dataset(self, ds): assert isinstance(ds, Dataset), "'ds' is not 'Dataset'" @@ -1859,13 +1806,6 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor): - the ``n_pairs`` parameter must be left to its default value, 0; - the ``n_text_features`` parameter must be set to 0. - n_features_evaluated_ : int - The number of features evaluated by the classifier. - feature_evaluated_names_ : `ndarray ` of shape (n_features_evaluated\_,) - Names of the features evaluated by the classifier. - feature_evaluated_importances_ : `ndarray ` of shape (n_features_evaluated\_,) - Level of the features evaluated by the classifier. - See below for a definition of the level. 
n_features_used_ : int The number of features used by the classifier. feature_used_names_ : `ndarray ` of shape (n_features_used\_, ) @@ -2294,13 +2234,6 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor): - the ``n_pairs`` parameter must be left to its default value, 0; - the ``n_text_features`` parameter must be set to 0. - n_features_evaluated_ : int - The number of features evaluated by the classifier. - feature_evaluated_names_ : `ndarray ` of shape (n_features_evaluated\_,) - Names of the features evaluated by the classifier. - feature_evaluated_importances_ : `ndarray ` of shape (n_features_evaluated\_,) - Level of the features evaluated by the classifier. - See below for a definition of the level. n_features_used_ : int The number of features used by the classifier. feature_used_names_ : `ndarray ` of shape (n_features_used\_, ) @@ -2561,14 +2494,6 @@ class KhiopsEncoder(TransformerMixin, KhiopsSupervisedEstimator): Attributes ---------- - n_features_evaluated_ : int - The number of features evaluated by the classifier. - feature_evaluated_names_ : `ndarray ` of shape (n_features_evaluated\_,) - Names of the features evaluated by the classifier. - feature_evaluated_importances_ : `ndarray ` of shape (n_features_evaluated\_,) - Level of the features evaluated by the classifier. The Level is measure of the - predictive importance of the feature taken individually. It ranges between 0 (no - predictive interest) and 1 (optimal predictive importance). is_multitable_model_ : bool ``True`` if the model was fitted on a multi-table dataset. 
model_ : `.DictionaryDomain` diff --git a/tests/test_estimator_attributes.py b/tests/test_estimator_attributes.py index 73434e45..3c42129b 100644 --- a/tests/test_estimator_attributes.py +++ b/tests/test_estimator_attributes.py @@ -79,68 +79,6 @@ def assert_attribute_values_ok(self, model, X, y): # Extract the features and their levels from the report # TODO: Eliminate this as this is the implementation - # Think of a better lighter test: For example verify that the variable are - # in order within the 3 feature lists (simple, pairs and trees). - # Do similarly below with the selected variables. - univariate_preparation_report = model.model_report_.preparation_report - if model.model_report_.bivariate_preparation_report is not None: - bivariate_preparation_report = ( - model.model_report_.bivariate_preparation_report - ) - pair_feature_evaluated_names_ = ( - bivariate_preparation_report.get_variable_pair_names() - ) - pair_feature_evaluated_levels_ = [ - [ - bivariate_preparation_report.get_variable_pair_statistics( - var[0], var[1] - ).level - ] - for var in bivariate_preparation_report.get_variable_pair_names() - ] - else: - pair_feature_evaluated_names_ = [] - pair_feature_evaluated_levels_ = [] - if model.model_report_.tree_preparation_report is not None: - tree_preparation_report = model.model_report_.tree_preparation_report - tree_feature_evaluated_names_ = tree_preparation_report.get_variable_names() - tree_feature_evaluated_levels_ = [ - [tree_preparation_report.get_variable_statistics(var).level] - for var in tree_preparation_report.get_variable_names() - ] - else: - tree_feature_evaluated_names_ = [] - tree_feature_evaluated_levels_ = [] - - feature_evaluated_names_report_ = ( - univariate_preparation_report.get_variable_names() - + pair_feature_evaluated_names_ - + tree_feature_evaluated_names_ - ) - feature_evaluated_importances_report = np.array( - [ - [univariate_preparation_report.get_variable_statistics(var).level] - for var in 
univariate_preparation_report.get_variable_names() - ] - + pair_feature_evaluated_levels_ - + tree_feature_evaluated_levels_ - ) - - # Sort the features by level - combined = list( - zip(feature_evaluated_names_report_, feature_evaluated_importances_report) - ) - combined.sort(key=lambda x: x[1], reverse=True) - feature_names = list(x[0] for x in combined) - feature_levels = list(x[1] for x in combined) - - # Check that the features and their levels were extracted in order - self.assertEqual( - model.n_features_evaluated_, len(feature_evaluated_names_report_) - ) - self.assertEqual(model.feature_evaluated_names_.tolist(), list(feature_names)) - self.assertEqual(model.feature_evaluated_importances_.tolist(), feature_levels) - modeling_report = model.model_report_.modeling_report # Check the selected variables for the regressor and classifier if not isinstance(model, KhiopsEncoder): From 9512cb3c8559308ec5e6dca630fb5836f3485107 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Fri, 3 Oct 2025 14:47:05 +0200 Subject: [PATCH 4/9] Drop level and weight from the .feature_used_importances_ estimator attribute The level and weight of the features characterize the analysis process, not the resulting model itself. 
--- doc/samples/samples_sklearn.rst | 8 ++--- khiops/samples/samples_sklearn.ipynb | 8 ++--- khiops/samples/samples_sklearn.py | 8 ++--- khiops/sklearn/estimators.py | 45 +++++++--------------------- tests/test_estimator_attributes.py | 5 ++-- 5 files changed, 24 insertions(+), 50 deletions(-) diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index c6e364b6..c8c2daaf 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -72,7 +72,7 @@ Samples print(f"Features selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): - print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print(f"{feature} - Importance: {khc.feature_used_importances_[i]}") print("---") print("Top 5 used features, among those present in the dataset") @@ -198,7 +198,7 @@ Samples print(f"Features selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): - print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print(f"{feature} - Importance: {khc.feature_used_importances_[i]}") print("---") # Predict the classes on the test dataset @@ -318,7 +318,7 @@ Samples print(f"Features selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): - print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print(f"{feature} - Importance: {khc.feature_used_importances_[i]}") print("---") # Predict the class on the test dataset @@ -549,7 +549,7 @@ Samples print(f"Features selected : {khr.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khr.feature_used_names_[:3]): - print(f"{feature} - Importance: {khr.feature_used_importances_[i][2]}") + print(f"{feature} - Importance: {khr.feature_used_importances_[i]}") print("---") # Predict the values on the test dataset diff --git 
a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 66cdac58..6362c282 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -58,7 +58,7 @@ "print(f\"Features selected : {khc.n_features_used_}\")\n", "print(\"Top 3 used features\")\n", "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", - " print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n", + " print(f\"{feature} - Importance: {khc.feature_used_importances_[i]}\")\n", "print(\"---\")\n", "\n", "print(\"Top 5 used features, among those present in the dataset\")\n", @@ -210,7 +210,7 @@ "print(f\"Features selected : {khc.n_features_used_}\")\n", "print(\"Top 3 used features\")\n", "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", - " print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n", + " print(f\"{feature} - Importance: {khc.feature_used_importances_[i]}\")\n", "print(\"---\")\n", "\n", "# Predict the classes on the test dataset\n", @@ -356,7 +356,7 @@ "print(f\"Features selected : {khc.n_features_used_}\")\n", "print(\"Top 3 used features\")\n", "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", - " print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n", + " print(f\"{feature} - Importance: {khc.feature_used_importances_[i]}\")\n", "print(\"---\")\n", "\n", "# Predict the class on the test dataset\n", @@ -639,7 +639,7 @@ "print(f\"Features selected : {khr.n_features_used_}\")\n", "print(\"Top 3 used features\")\n", "for i, feature in enumerate(khr.feature_used_names_[:3]):\n", - " print(f\"{feature} - Importance: {khr.feature_used_importances_[i][2]}\")\n", + " print(f\"{feature} - Importance: {khr.feature_used_importances_[i]}\")\n", "print(\"---\")\n", "\n", "# Predict the values on the test dataset\n", diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index e5dbdd67..06629435 100644 --- 
a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -61,7 +61,7 @@ def khiops_classifier(): print(f"Features selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): - print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print(f"{feature} - Importance: {khc.feature_used_importances_[i]}") print("---") print("Top 5 used features, among those present in the dataset") @@ -193,7 +193,7 @@ def khiops_classifier_text(): print(f"Features selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): - print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print(f"{feature} - Importance: {khc.feature_used_importances_[i]}") print("---") # Predict the classes on the test dataset @@ -321,7 +321,7 @@ def khiops_classifier_multitable_snowflake(): print(f"Features selected : {khc.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khc.feature_used_names_[:3]): - print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print(f"{feature} - Importance: {khc.feature_used_importances_[i]}") print("---") # Predict the class on the test dataset @@ -560,7 +560,7 @@ def khiops_regressor(): print(f"Features selected : {khr.n_features_used_}") print("Top 3 used features") for i, feature in enumerate(khr.feature_used_names_[:3]): - print(f"{feature} - Importance: {khr.feature_used_importances_[i][2]}") + print(f"{feature} - Importance: {khr.feature_used_importances_[i]}") print("---") # Predict the values on the test dataset diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 160004e6..be3db1d1 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -1567,9 +1567,7 @@ def _fit_training_post_process(self, ds): for feature_name in self.feature_names_in_: if feature_name in feature_used_names: feature_index = 
np.where(feature_used_names == feature_name) - feature_importance = self.feature_used_importances_[ - feature_index - ].ravel()[2] + feature_importance = self.feature_used_importances_[feature_index][0] else: feature_importance = 0.0 feature_importances.append(feature_importance) @@ -1672,10 +1670,7 @@ def get_feature_used_statistics(self, modeling_report): [var.name for var in modeling_report.selected_variables] ) feature_used_importances_ = np.array( - [ - [var.level, var.weight, var.importance] - for var in modeling_report.selected_variables - ] + [var.importance for var in modeling_report.selected_variables] ) # Return empty arrays if no selected variables are available else: @@ -1810,20 +1805,10 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor): The number of features used by the classifier. feature_used_names_ : `ndarray ` of shape (n_features_used\_, ) Names of the features used by the classifier. - feature_used_importances_ : `ndarray ` of shape (n_features_used\_, 3) - Level, Weight and Importance of the features used by the classifier: - - - Level: A measure of the predictive importance of the feature taken - individually. It ranges between 0 (no predictive interest) and 1 (optimal - predictive importance). - - - Weight: A measure of the predictive importance of the feature taken relative - to all features selected by the classifier. It ranges between 0 (little - contribution to the model) and 1 (large contribution to the model). - - - Importance: Average of the exact Shapley values of each used feature - across the training data. - + feature_used_importances_ : `ndarray ` of shape (n_features_used\_,) + Importance of the features used by the classifier. The importance is + computed as the average of the exact Shapley values of each used feature + across the training dataset. is_multitable_model_ : bool ``True`` if the model was fitted on a multi-table dataset. 
model_ : `.DictionaryDomain` @@ -2238,20 +2223,10 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor): The number of features used by the classifier. feature_used_names_ : `ndarray ` of shape (n_features_used\_, ) Names of the features used by the classifier. - feature_used_importances_ : `ndarray ` of shape (n_features_used\_, 3) - Level, Weight and Importance of the features used by the classifier: - - - Level: A measure of the predictive importance of the feature taken - individually. It ranges between 0 (no predictive interest) and 1 (optimal - predictive importance). - - - Weight: A measure of the predictive importance of the feature taken relative - to all features selected by the classifier. It ranges between 0 (little - contribution to the model) and 1 (large contribution to the model). - - - Importance: Average of the exact Shapley values of each used feature - across the training data. - + feature_used_importances_ : `ndarray ` of shape (n_features_used\_,) + Importance of the features used by the regressor. The importance is + computed as the average of the exact Shapley values of each used feature + across the training dataset. is_multitable_model_ : bool ``True`` if the model was fitted on a multi-table dataset. 
model_ : `.DictionaryDomain` diff --git a/tests/test_estimator_attributes.py b/tests/test_estimator_attributes.py index 3c42129b..b0fc270e 100644 --- a/tests/test_estimator_attributes.py +++ b/tests/test_estimator_attributes.py @@ -9,7 +9,6 @@ import warnings from os import path -import numpy as np import pandas as pd from khiops import core as kh @@ -89,7 +88,7 @@ def assert_attribute_values_ok(self, model, X, y): for var in modeling_report.get_snb_predictor().selected_variables ] feature_used_importances_report = [ - [var.level, var.weight, var.importance] + var.importance for var in modeling_report.get_snb_predictor().selected_variables ] @@ -113,7 +112,7 @@ def assert_attribute_values_ok(self, model, X, y): if feature_name in feature_used_names: feature_index = feature_used_names.index(feature_name) feature_importances_report.append( - feature_used_importances_report[feature_index][2] + feature_used_importances_report[feature_index] ) else: feature_importances_report.append(0.0) From 6a59b45372e951fb5524446b05fe5d8f8d6ab8a4 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Thu, 9 Oct 2025 17:40:42 +0200 Subject: [PATCH 5/9] Simplify feature_importances_ computation --- doc/samples/samples_sklearn.rst | 2 +- khiops/samples/samples_sklearn.ipynb | 2 +- khiops/samples/samples_sklearn.py | 2 +- khiops/sklearn/estimators.py | 24 +++++++++++------------- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index c8c2daaf..ccc4a6b0 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -78,7 +78,7 @@ Samples print("Top 5 used features, among those present in the dataset") for feature, importance in sorted( zip(khc.feature_names_in_, khc.feature_importances_), - key=lambda imp: float(imp[1]), + key=lambda imp: imp[1], reverse=True, )[:5]: print(f"{feature} - Importance: {importance}") diff --git 
a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 6362c282..52c2bd4e 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -64,7 +64,7 @@ "print(\"Top 5 used features, among those present in the dataset\")\n", "for feature, importance in sorted(\n", " zip(khc.feature_names_in_, khc.feature_importances_),\n", - " key=lambda imp: float(imp[1]),\n", + " key=lambda imp: imp[1],\n", " reverse=True,\n", ")[:5]:\n", " print(f\"{feature} - Importance: {importance}\")\n", diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index 06629435..5195aca5 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -67,7 +67,7 @@ def khiops_classifier(): print("Top 5 used features, among those present in the dataset") for feature, importance in sorted( zip(khc.feature_names_in_, khc.feature_importances_), - key=lambda imp: float(imp[1]), + key=lambda imp: imp[1], reverse=True, )[:5]: print(f"{feature} - Importance: {importance}") diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index be3db1d1..fa940c58 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -1557,21 +1557,19 @@ def _fit_training_post_process(self, ds): ) self.feature_used_names_ = feature_used_names_ self.feature_used_importances_ = feature_used_importances_ - self.n_features_used_ = len(self.feature_used_names_) - - # feature_used_names_ is not set if no variable is selected in the model - feature_used_names = getattr(self, "feature_used_names_", []) + else: + self.feature_used_names_ = [] + self.feature_used_importances_ = np.array([]) + self.n_features_used_ = len(self.feature_used_names_) # Compute feature importances - feature_importances = [] - for feature_name in self.feature_names_in_: - if feature_name in feature_used_names: - feature_index = np.where(feature_used_names == feature_name) - feature_importance = 
self.feature_used_importances_[feature_index][0] - else: - feature_importance = 0.0 - feature_importances.append(feature_importance) - self.feature_importances_ = np.array(feature_importances) + self.feature_importances_ = np.zeros(self.feature_names_in_.shape) + for i, feature_name in enumerate(self.feature_names_in_): + if feature_name in self.feature_used_names_: + feature_index = np.where(self.feature_used_names_ == feature_name) + self.feature_importances_[i] = self.feature_used_importances_[ + feature_index + ][0] def __sklearn_tags__(self): # If we don't implement this trivial method it's not found by the sklearn. This From 2e47c3878439bc6fbfcdfb9c1d4e98542e8cb89b Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Tue, 7 Oct 2025 19:40:22 +0200 Subject: [PATCH 6/9] Show the first 5 used feature importances in the sklearn classifier sample This enables a more even parallel with the (input) feature importances. --- doc/samples/samples_sklearn.rst | 4 ++-- khiops/samples/samples_sklearn.ipynb | 4 ++-- khiops/samples/samples_sklearn.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index ccc4a6b0..00eaa04d 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -70,8 +70,8 @@ Samples # Show the feature importance info print(f"Features selected : {khc.n_features_used_}") - print("Top 3 used features") - for i, feature in enumerate(khc.feature_used_names_[:3]): + print("Top 5 used features") + for i, feature in enumerate(khc.feature_used_names_[:5]): print(f"{feature} - Importance: {khc.feature_used_importances_[i]}") print("---") diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 52c2bd4e..25e20d7e 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -56,8 +56,8 @@ "\n", "# Show the feature importance info\n", 
"print(f\"Features selected : {khc.n_features_used_}\")\n", - "print(\"Top 3 used features\")\n", - "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", + "print(\"Top 5 used features\")\n", + "for i, feature in enumerate(khc.feature_used_names_[:5]):\n", " print(f\"{feature} - Importance: {khc.feature_used_importances_[i]}\")\n", "print(\"---\")\n", "\n", diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index 5195aca5..55c865b4 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -59,8 +59,8 @@ def khiops_classifier(): # Show the feature importance info print(f"Features selected : {khc.n_features_used_}") - print("Top 3 used features") - for i, feature in enumerate(khc.feature_used_names_[:3]): + print("Top 5 used features") + for i, feature in enumerate(khc.feature_used_names_[:5]): print(f"{feature} - Importance: {khc.feature_used_importances_[i]}") print("---") From b14fa829e44020295e9e0d523d9fa09959d9ccbe Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Tue, 7 Oct 2025 19:34:29 +0200 Subject: [PATCH 7/9] Fix docstring reference to metadata in dictionary getters --- khiops/core/dictionary.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/khiops/core/dictionary.py b/khiops/core/dictionary.py index 31721205..7b1c9990 100644 --- a/khiops/core/dictionary.py +++ b/khiops/core/dictionary.py @@ -733,7 +733,7 @@ def get_value(self, key): Returns ------- - `Metadata` + `MetaData` Metadata value associated to the specified key. ``None`` is returned if the metadata key is not found. """ @@ -1267,7 +1267,7 @@ def get_value(self, key): Returns ------- - `Metadata` + `MetaData` Metadata value associated to the specified key. ``None`` is returned if the metadata key is not found. """ @@ -1527,7 +1527,7 @@ def get_value(self, key): Returns ------- - `Metadata` + `MetaData` Metadata value associated to the specified key. 
``None`` is returned if the metadata key is not found. """ From c2db9121d17a74654c99fbcf92594bef7635dbac Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Tue, 7 Oct 2025 19:36:21 +0200 Subject: [PATCH 8/9] Expose default construction rules and fix relevant documentation - separate construction rules into: - rules applied by default (`DEFAULT_CONSTRUCTION_RULES`); - calendar-related rules (`CALENDRICAL_CONSTRUCTION_RULES`); - document the construction rules; - fix the `construction_rules` parameter documentation in the Core API; - fix the `n_features` parameter documentation in the Sklearn estimator API; - update the `feature_importances_` attribute documentation in the Sklearn estimator API accordingly. --- khiops/core/api.py | 58 +++++++++++++++++++++++++----------- khiops/samples/samples.ipynb | 2 +- khiops/samples/samples.py | 2 +- khiops/sklearn/estimators.py | 34 ++++++++++++--------- 4 files changed, 62 insertions(+), 34 deletions(-) diff --git a/khiops/core/api.py b/khiops/core/api.py index 166d97aa..561b29ba 100644 --- a/khiops/core/api.py +++ b/khiops/core/api.py @@ -31,18 +31,10 @@ from khiops.core.internals.runner import get_runner from khiops.core.internals.task import get_task_registry -# List of all available construction rules in the Khiops tool -all_construction_rules = [ - "Day", - "DecimalTime", - "DecimalWeekDay", - "DecimalYear", - "DecimalYearTS", - "GetDate", - "GetTime", +# Construction rules +DEFAULT_CONSTRUCTION_RULES = [ "GetValue", "GetValueC", - "LocalTimestamp", "TableCount", "TableCountDistinct", "TableMax", @@ -53,9 +45,37 @@ "TableSelection", "TableStdDev", "TableSum", +] +"""List of construction rules that Khiops uses by default + +.. note:: + These are all the multi-table rules. 
+""" # pylint: disable=pointless-string-statement + +CALENDRICAL_CONSTRUCTION_RULES = [ + "Day", + "DecimalTime", + "DecimalWeekDay", + "DecimalYear", + "DecimalYearTS", + "GetDate", + "GetTime", + "LocalTimestamp", "WeekDay", "YearDay", ] +"""List of calendrical construction rules + +These rules include: date, time and timestamp rules. + +.. note:: + These rules are not enabled by default. The user needs to explicitly + select each of them via the ``construction_rules`` parameter of the + relevant Core API functions. +""" # pylint: disable=pointless-string-statement + +# List of all available construction rules in the Khiops tool +ALL_CONSTRUCTION_RULES = DEFAULT_CONSTRUCTION_RULES + CALENDRICAL_CONSTRUCTION_RULES ########################## # Private module methods # @@ -758,8 +778,9 @@ def train_predictor( max_constructed_variables : int, default 1000 Maximum number of variables to construct. construction_rules : list of str, optional - Allowed rules for the automatic variable construction. If not set it uses all - possible rules. + Allowed rules for the automatic variable construction. If not set, Khiops + uses the multi-table construction rules listed in + `DEFAULT_CONSTRUCTION_RULES`. max_text_features : int, default 10000 Maximum number of text features to construct. text_features : str, default "words" @@ -1190,21 +1211,22 @@ def train_recoder( max_constructed_variables : int, default 100 Maximum number of variables to construct. construction_rules : list of str, optional - Allowed rules for the automatic variable construction. If not set it uses all - possible rules. + Allowed rules for the automatic variable construction. If not set, Khiops + uses the multi-table construction rules listed in + `DEFAULT_CONSTRUCTION_RULES`. max_text_features : int, default 10000 Maximum number of text features to construct. text_features : str, default "words" Type of the text features. 
Can be either one of: - - "words": sequences of non-space characters - - "ngrams": sequences of bytes - - "tokens": user-defined + - "words": sequences of non-space characters + - "ngrams": sequences of bytes + - "tokens": user-defined max_trees : int, default 10 Maximum number of trees to construct. max_pairs : int, default 0 - Maximum number of variables pairs to construct. + Maximum number of variable pairs to construct. specific_pairs : list of tuple, optional User-specified pairs as a list of 2-tuples of feature names. If a given tuple contains only one non-empty feature name, then it generates all the pairs diff --git a/khiops/samples/samples.ipynb b/khiops/samples/samples.ipynb index 13242e98..8070cc12 100644 --- a/khiops/samples/samples.ipynb +++ b/khiops/samples/samples.ipynb @@ -503,7 +503,7 @@ "metadata": {}, "source": [ "### `train_predictor_mt_with_specific_rules()`\n\n", - "Trains a multi-table predictor with specific construction rules\n\n It is the same as `.train_predictor_mt` but with the specification of the allowed\n variable construction rules. The list of available rules is found in the field\n ``kh.all_construction_rules``\n \n" + "Trains a multi-table predictor with specific construction rules\n\n It is the same as `.train_predictor_mt` but with the specification of the allowed\n variable construction rules. The list of available rules is found in the field\n ``kh.ALL_CONSTRUCTION_RULES``\n \n" ] }, { diff --git a/khiops/samples/samples.py b/khiops/samples/samples.py index c5df0703..daca4aab 100644 --- a/khiops/samples/samples.py +++ b/khiops/samples/samples.py @@ -432,7 +432,7 @@ def train_predictor_mt_with_specific_rules(): It is the same as `.train_predictor_mt` but with the specification of the allowed variable construction rules. 
The list of available rules is found in the field - ``kh.all_construction_rules`` + ``kh.ALL_CONSTRUCTION_RULES`` """ # Imports import os diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index fa940c58..cbd71823 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -1704,8 +1704,9 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor): Parameters ---------- n_features : int, default 100 - *Multi-table only* : Maximum number of multi-table aggregate features to - construct. See :doc:`/multi_table_primer` for more details. + Maximum number of features to construct automatically. See + :doc:`/multi_table_primer` for more details on the multi-table-specific + features. n_pairs : int, default 0 Maximum number of pair features to construct. These features are 2D grid partitions of univariate feature pairs. The grid is optimized such that in each @@ -1740,8 +1741,9 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor): Pairs specified with ``specific_pairs`` have top priority: they are constructed first. construction_rules : list of str, optional - Allowed rules for the automatic feature construction. If not set, it uses all - possible rules. + Allowed rules for the automatic feature construction. If not set, Khiops + uses the multi-table construction rules listed in + `kh.DEFAULT_CONSTRUCTION_RULES `. group_target_value : bool, default ``False`` Allows grouping of the target values in classification. It can substantially increase the training time. @@ -1794,7 +1796,7 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor): one must disable most AutoML capabilities of Khiops, namely: - the training dataset must be monotable; - - no timestamp column should be used in the training dataset; + - the ``n_features`` parameter must be set to 0; - the ``n_trees`` parameter must be set to 0; - the ``n_pairs`` parameter must be left to its default value, 0; - the ``n_text_features`` parameter must be set to 0. 
@@ -2159,8 +2161,9 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor): Parameters ---------- n_features : int, default 100 - *Multi-table only* : Maximum number of multi-table aggregate features to - construct. See :doc:`/multi_table_primer` for more details. + Maximum number of features to construct automatically. See + :doc:`/multi_table_primer` for more details on the multi-table-specific + features. n_selected_features : int, default 0 Maximum number of features to be selected in the SNB predictor. If equal to 0 it selects all the features kept in the training. @@ -2168,8 +2171,9 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor): Maximum number of features to be evaluated in the SNB predictor training. If equal to 0 it evaluates all informative features. construction_rules : list of str, optional - Allowed rules for the automatic feature construction. If not set, it uses all - possible rules. + Allowed rules for the automatic feature construction. If not set, Khiops + uses the multi-table construction rules listed in + `kh.DEFAULT_CONSTRUCTION_RULES `. verbose : bool, default ``False`` If ``True`` it prints debug information and it does not erase temporary files when fitting, predicting or transforming. @@ -2212,7 +2216,7 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor): one must disable most AutoML capabilities of Khiops, namely: - the training dataset must be monotable; - - no timestamp column should be used in the training dataset; + - the ``n_features`` parameter must be set to 0; - the ``n_trees`` parameter must be set to 0; - the ``n_pairs`` parameter must be left to its default value, 0; - the ``n_text_features`` parameter must be set to 0. @@ -2386,8 +2390,9 @@ class KhiopsEncoder(TransformerMixin, KhiopsSupervisedEstimator): categorical_target : bool, default ``True`` ``True`` if the target column is categorical. n_features : int, default 100 - *Multi-table only* : Maximum number of multi-table aggregate features to - construct. 
See :doc:`/multi_table_primer` for more details. + Maximum number of features to construct automatically. See + :doc:`/multi_table_primer` for more details on the multi-table-specific + features. n_pairs : int, default 0 Maximum number of pair features to construct. These features are 2D grid partitions of univariate feature pairs. The grid is optimized such that in each @@ -2415,8 +2420,9 @@ class KhiopsEncoder(TransformerMixin, KhiopsSupervisedEstimator): Pairs specified with ``specific_pairs`` have top priority: they are constructed first. construction_rules : list of str, optional - Allowed rules for the automatic feature construction. If not set, it uses all - possible rules. + Allowed rules for the automatic feature construction. If not set, Khiops + uses the multi-table construction rules listed in + `kh.DEFAULT_CONSTRUCTION_RULES `. informative_features_only : bool, default ``True`` If ``True`` keeps only informative features. group_target_value : bool, default ``False`` From 9d19f9a727467ba4384d3816e89ffb3a87f2d43c Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:33:43 +0200 Subject: [PATCH 9/9] Update CHANGELOG --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cb1ae2a..0d07572c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,13 @@ comments, and dictionary and variable block internal comments. - (`core`) Dictionary `Rule` class and supporting API for serializing `Rule` instances. - (`core`) New way to add a variable to a dictionary using a complete specification. +- (`core`) New API constants for rules used in automatic variable construction: + - `DEFAULT_CONSTRUCTION_RULES`: names of table and entity construction rules, + which are applied by default + - `CALENDRICAL_CONSTRUCTION_RULES`: names of date, time and timestamp rules. - (`sklearn`) `Text` Khiops type support at the estimator level. 
+- (`sklearn`) The `feature_names_in_` and `feature_importances_` Khiops + classifier and regressor estimator attributes. ### Changed - (`core`) Dictionary API (DictionaryDomain, Dictionary, MetaData),