From 4f772b2089865979643708766d6437d17961674d Mon Sep 17 00:00:00 2001
From: NAIR BENREKIA Nour Eddine Yassine INNOV/IT-S
 <noureddineyassine.nairbenrekia@orange.com>
Date: Mon, 21 Oct 2024 15:41:17 +0200
Subject: [PATCH] Support float and boolean targets in KhiopsClassifier

---
 CHANGELOG.md                       |  1 +
 khiops/sklearn/estimators.py       | 35 +++++++++++++++++++++++++-----
 tests/test_sklearn_output_types.py |  7 +++++-
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7c858efc..6eec6fb0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@
 
 - (General) `visualize_report` helper function to open reports with the Khiops Visualization and
   Khiops Co-Visualization app.
+- (`sklearn`) Support for `float` and `boolean` targets in `KhiopsClassifier`.
 
 ## 10.2.3.1 - 2024-11-27
 
diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py
index 47bfc16b..c772b9ac 100644
--- a/khiops/sklearn/estimators.py
+++ b/khiops/sklearn/estimators.py
@@ -153,6 +153,7 @@ def _check_categorical_target_type(ds):
         or pd.api.types.is_string_dtype(ds.target_column.dtype)
         or pd.api.types.is_integer_dtype(ds.target_column.dtype)
         or pd.api.types.is_float_dtype(ds.target_column.dtype)
+        or pd.api.types.is_bool_dtype(ds.target_column.dtype)
     ):
         raise ValueError(
             f"'y' has invalid type '{ds.target_column_type}'. "
@@ -2093,6 +2094,24 @@ def _is_real_target_dtype_integer(self):
             )
         )
 
+    def _is_real_target_dtype_float(self):
+        return self._original_target_dtype is not None and (
+            pd.api.types.is_float_dtype(self._original_target_dtype)
+            or (
+                isinstance(self._original_target_dtype, pd.CategoricalDtype)
+                and pd.api.types.is_float_dtype(self._original_target_dtype.categories)
+            )
+        )
+
+    def _is_real_target_dtype_bool(self):
+        return self._original_target_dtype is not None and (
+            pd.api.types.is_bool_dtype(self._original_target_dtype)
+            or (
+                isinstance(self._original_target_dtype, pd.CategoricalDtype)
+                and pd.api.types.is_bool_dtype(self._original_target_dtype.categories)
+            )
+        )
+
     def _sorted_prob_variable_names(self):
         """Returns the model probability variable names in the order of self.classes_"""
         assert self.is_fitted_, "Model not fit yet"
@@ -2195,11 +2214,15 @@ def _fit_training_post_process(self, ds):
             for key in variable.meta_data.keys:
                 if key.startswith("TargetProb"):
                     self.classes_.append(variable.meta_data.get_value(key))
-        if ds.is_in_memory and self._is_real_target_dtype_integer():
-            self.classes_ = [int(class_value) for class_value in self.classes_]
+        if ds.is_in_memory:
+            if self._is_real_target_dtype_integer():
+                self.classes_ = [int(class_value) for class_value in self.classes_]
+            elif self._is_real_target_dtype_float():
+                self.classes_ = [float(class_value) for class_value in self.classes_]
+            elif self._is_real_target_dtype_bool():
+                self.classes_ = [class_value == "True" for class_value in self.classes_]
             self.classes_.sort()
         self.classes_ = column_or_1d(self.classes_)
-
         # Count number of classes
         self.n_classes_ = len(self.classes_)
 
@@ -2259,13 +2282,11 @@ def predict(self, X):
         """
         # Call the parent's method
         y_pred = super().predict(X)
-
         # Adjust the data type according to the original target type
         # Note: String is coerced explictly because astype does not work as expected
         if isinstance(y_pred, pd.DataFrame):
             # Transform to numpy.ndarray
             y_pred = y_pred.to_numpy(copy=False).ravel()
-
             # If integer and string just transform
             if pd.api.types.is_integer_dtype(self._original_target_dtype):
                 y_pred = y_pred.astype(self._original_target_dtype)
@@ -2275,6 +2296,10 @@ def predict(self, X):
                 self._original_target_dtype
             ):
                 y_pred = y_pred.astype(str, copy=False)
+            elif pd.api.types.is_float_dtype(self._original_target_dtype):
+                y_pred = y_pred.astype(float, copy=False)
+            elif pd.api.types.is_bool_dtype(self._original_target_dtype):
+                y_pred = y_pred.astype(bool, copy=False)
             # If category first coerce the type to the categories' type
             else:
                 assert isinstance(self._original_target_dtype, pd.CategoricalDtype), (
diff --git a/tests/test_sklearn_output_types.py b/tests/test_sklearn_output_types.py
index 7e728f70..94198b31 100644
--- a/tests/test_sklearn_output_types.py
+++ b/tests/test_sklearn_output_types.py
@@ -54,11 +54,12 @@ def test_classifier_output_types(self):
         """Test the KhiopsClassifier output types and classes of predict* methods"""
         X, y = create_iris()
         X_mt, X_sec_mt, _ = create_iris_mt()
-
         fixtures = {
             "ys": {
                 "int": y,
                 "int binary": y.replace({0: 0, 1: 0, 2: 1}),
+                "float": y.astype(float),
+                "bool": y.replace({0: True, 1: True, 2: False}),
                 "string": y.replace({0: "se", 1: "vi", 2: "ve"}),
                 "string binary": y.replace({0: "vi_or_se", 1: "vi_or_se", 2: "ve"}),
                 "int as string": y.replace({0: "8", 1: "9", 2: "10"}),
@@ -69,6 +70,8 @@ def test_classifier_output_types(self):
             "y_type_check": {
                 "int": pd.api.types.is_integer_dtype,
                 "int binary": pd.api.types.is_integer_dtype,
+                "float": pd.api.types.is_float_dtype,
+                "bool": pd.api.types.is_bool_dtype,
                 "string": pd.api.types.is_string_dtype,
                 "string binary": pd.api.types.is_string_dtype,
                 "int as string": pd.api.types.is_string_dtype,
@@ -79,6 +82,8 @@ def test_classifier_output_types(self):
             "expected_classes": {
                 "int": column_or_1d([0, 1, 2]),
                 "int binary": column_or_1d([0, 1]),
+                "float": column_or_1d([0.0, 1.0, 2.0]),
+                "bool": column_or_1d([False, True]),
                 "string": column_or_1d(["se", "ve", "vi"]),
                 "string binary": column_or_1d(["ve", "vi_or_se"]),
                 "int as string": column_or_1d(["10", "8", "9"]),