Skip to content

Commit a404e40

Browse files
committed
support float and boolean targets in KhiopsClassifier
1 parent 7cf8c15 commit a404e40

File tree

7 files changed

+339
-10
lines changed

7 files changed

+339
-10
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66
- Example: 10.2.1.4 is the 5th version that supports khiops 10.2.1.
77
- Internals: Changes in *Internals* sections are unlikely to be of interest for data scientists.
88

9+
## Unreleased
10+
11+
### Added
12+
- (`sklearn`) Support for boolean and float targets in `KhiopsClassifier`.
13+
914
## 10.3.0.0 - 2025-02-10
1015

1116
### Fixed

doc/samples/samples_sklearn.rst

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,82 @@ Samples
9898
# If you have Khiops Visualization installed you may open the report as follows
9999
# khc.export_report_file("report.khj")
100100
# kh.visualize_report("report.khj")
101+
.. autofunction:: khiops_classifier_boolean_target
102+
.. code-block:: python
103+
104+
# Imports
105+
import os
106+
import pandas as pd
107+
from khiops import core as kh
108+
from khiops.sklearn import KhiopsClassifier
109+
from sklearn.model_selection import train_test_split
110+
111+
# Load the dataset into a pandas dataframe
112+
adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
113+
adult_df = pd.read_csv(adult_path, sep="\t")
114+
115+
# Split the whole dataframe into train and test (70%-30%)
116+
adult_train_df, adult_test_df = train_test_split(
117+
adult_df, test_size=0.3, random_state=1
118+
)
119+
120+
# Split the dataset into:
121+
# - the X feature table
122+
# - the y target vector ("class" column)
123+
X_train = adult_train_df.drop("class", axis=1)
124+
X_test = adult_test_df.drop("class", axis=1)
125+
y_train = adult_train_df["class"]
126+
y_train.replace({"less": False, "more": True}, inplace=True)
127+
128+
# Create the classifier object
129+
khc = KhiopsClassifier()
130+
131+
# Train the classifier
132+
khc.fit(X_train, y_train)
133+
134+
# Predict the classes on the test dataset
135+
y_test_pred = khc.predict(X_test)
136+
print("Predicted classes (first 10):")
137+
print(y_test_pred[0:10])
138+
print("---")
139+
.. autofunction:: khiops_classifier_float_target
140+
.. code-block:: python
141+
142+
# Imports
143+
import os
144+
import pandas as pd
145+
from khiops import core as kh
146+
from khiops.sklearn import KhiopsClassifier
147+
from sklearn.model_selection import train_test_split
148+
149+
# Load the dataset into a pandas dataframe
150+
adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
151+
adult_df = pd.read_csv(adult_path, sep="\t")
152+
153+
# Split the whole dataframe into train and test (70%-30%)
154+
adult_train_df, adult_test_df = train_test_split(
155+
adult_df, test_size=0.3, random_state=1
156+
)
157+
158+
# Split the dataset into:
159+
# - the X feature table
160+
# - the y target vector ("class" column)
161+
X_train = adult_train_df.drop("class", axis=1)
162+
X_test = adult_test_df.drop("class", axis=1)
163+
y_train = adult_train_df["class"]
164+
y_train.replace({"less": 0.0, "more": 1.0}, inplace=True)
165+
166+
# Create the classifier object
167+
khc = KhiopsClassifier()
168+
169+
# Train the classifier
170+
khc.fit(X_train, y_train)
171+
172+
# Predict the classes on the test dataset
173+
y_test_pred = khc.predict(X_test)
174+
print("Predicted classes (first 10):")
175+
print(y_test_pred[0:10])
176+
print("---")
101177
.. autofunction:: khiops_classifier_multiclass
102178
.. code-block:: python
103179

khiops/samples/samples_sklearn.ipynb

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,108 @@
8686
"# kh.visualize_report(\"report.khj\")"
8787
]
8888
},
89+
{
90+
"cell_type": "markdown",
91+
"metadata": {},
92+
"source": [
93+
"### `khiops_classifier_boolean_target()`\n\n",
94+
"Trains a `.KhiopsClassifier` on a monotable dataframe\n where the target is boolean\n"
95+
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": null,
100+
"metadata": {},
101+
"outputs": [],
102+
"source": [
103+
"# Imports\n",
104+
"import os\n",
105+
"import pandas as pd\n",
106+
"from khiops import core as kh\n",
107+
"from khiops.sklearn import KhiopsClassifier\n",
108+
"from sklearn.model_selection import train_test_split\n",
109+
"\n",
110+
"# Load the dataset into a pandas dataframe\n",
111+
"adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n",
112+
"adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n",
113+
"\n",
114+
"# Split the whole dataframe into train and test (70%-30%)\n",
115+
"adult_train_df, adult_test_df = train_test_split(\n",
116+
" adult_df, test_size=0.3, random_state=1\n",
117+
")\n",
118+
"\n",
119+
"# Split the dataset into:\n",
120+
"# - the X feature table\n",
121+
"# - the y target vector (\"class\" column)\n",
122+
"X_train = adult_train_df.drop(\"class\", axis=1)\n",
123+
"X_test = adult_test_df.drop(\"class\", axis=1)\n",
124+
"y_train = adult_train_df[\"class\"]\n",
125+
"y_train.replace({\"less\": False, \"more\": True}, inplace=True)\n",
126+
"\n",
127+
"# Create the classifier object\n",
128+
"khc = KhiopsClassifier()\n",
129+
"\n",
130+
"# Train the classifier\n",
131+
"khc.fit(X_train, y_train)\n",
132+
"\n",
133+
"# Predict the classes on the test dataset\n",
134+
"y_test_pred = khc.predict(X_test)\n",
135+
"print(\"Predicted classes (first 10):\")\n",
136+
"print(y_test_pred[0:10])\n",
137+
"print(\"---\")"
138+
]
139+
},
140+
{
141+
"cell_type": "markdown",
142+
"metadata": {},
143+
"source": [
144+
"### `khiops_classifier_float_target()`\n\n",
145+
"Trains a `.KhiopsClassifier` on a monotable dataframe\n where the target is float\n"
146+
]
147+
},
148+
{
149+
"cell_type": "code",
150+
"execution_count": null,
151+
"metadata": {},
152+
"outputs": [],
153+
"source": [
154+
"# Imports\n",
155+
"import os\n",
156+
"import pandas as pd\n",
157+
"from khiops import core as kh\n",
158+
"from khiops.sklearn import KhiopsClassifier\n",
159+
"from sklearn.model_selection import train_test_split\n",
160+
"\n",
161+
"# Load the dataset into a pandas dataframe\n",
162+
"adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n",
163+
"adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n",
164+
"\n",
165+
"# Split the whole dataframe into train and test (70%-30%)\n",
166+
"adult_train_df, adult_test_df = train_test_split(\n",
167+
" adult_df, test_size=0.3, random_state=1\n",
168+
")\n",
169+
"\n",
170+
"# Split the dataset into:\n",
171+
"# - the X feature table\n",
172+
"# - the y target vector (\"class\" column)\n",
173+
"X_train = adult_train_df.drop(\"class\", axis=1)\n",
174+
"X_test = adult_test_df.drop(\"class\", axis=1)\n",
175+
"y_train = adult_train_df[\"class\"]\n",
176+
"y_train.replace({\"less\": 0.0, \"more\": 1.0}, inplace=True)\n",
177+
"\n",
178+
"# Create the classifier object\n",
179+
"khc = KhiopsClassifier()\n",
180+
"\n",
181+
"# Train the classifier\n",
182+
"khc.fit(X_train, y_train)\n",
183+
"\n",
184+
"# Predict the classes on the test dataset\n",
185+
"y_test_pred = khc.predict(X_test)\n",
186+
"print(\"Predicted classes (first 10):\")\n",
187+
"print(y_test_pred[0:10])\n",
188+
"print(\"---\")"
189+
]
190+
},
89191
{
90192
"cell_type": "markdown",
91193
"metadata": {},

khiops/samples/samples_sklearn.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,86 @@ def khiops_classifier():
8989
# kh.visualize_report("report.khj")
9090

9191

92+
def khiops_classifier_boolean_target():
    """Trains a `.KhiopsClassifier` on a monotable dataframe
    where the target is boolean"""
    # Imports
    import os
    import pandas as pd
    from khiops import core as kh
    from khiops.sklearn import KhiopsClassifier
    from sklearn.model_selection import train_test_split

    # Load the dataset into a pandas dataframe
    adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    adult_df = pd.read_csv(adult_path, sep="\t")

    # Split the whole dataframe into train and test (70%-30%)
    adult_train_df, adult_test_df = train_test_split(
        adult_df, test_size=0.3, random_state=1
    )

    # Split the dataset into:
    # - the X feature table
    # - the y target vector ("class" column) recoded as booleans
    X_train = adult_train_df.drop("class", axis=1)
    X_test = adult_test_df.drop("class", axis=1)
    # Build a new boolean Series instead of replacing in place: the column is
    # derived from `adult_train_df`, so an in-place replace triggers pandas
    # chained-assignment warnings and relies on a deprecated silent downcast to
    # obtain a `bool` dtype. `map` infers the `bool` dtype directly.
    # NOTE: values other than "less"/"more" would be mapped to NaN.
    y_train = adult_train_df["class"].map({"less": False, "more": True})

    # Create the classifier object
    khc = KhiopsClassifier()

    # Train the classifier
    khc.fit(X_train, y_train)

    # Predict the classes on the test dataset
    y_test_pred = khc.predict(X_test)
    print("Predicted classes (first 10):")
    print(y_test_pred[0:10])
    print("---")
130+
131+
132+
def khiops_classifier_float_target():
    """Trains a `.KhiopsClassifier` on a monotable dataframe
    where the target is float"""
    # Imports
    import os
    import pandas as pd
    from khiops import core as kh
    from khiops.sklearn import KhiopsClassifier
    from sklearn.model_selection import train_test_split

    # Load the dataset into a pandas dataframe
    adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    adult_df = pd.read_csv(adult_path, sep="\t")

    # Split the whole dataframe into train and test (70%-30%)
    adult_train_df, adult_test_df = train_test_split(
        adult_df, test_size=0.3, random_state=1
    )

    # Split the dataset into:
    # - the X feature table
    # - the y target vector ("class" column) recoded as floats
    X_train = adult_train_df.drop("class", axis=1)
    X_test = adult_test_df.drop("class", axis=1)
    # Build a new float Series instead of replacing in place: the column is
    # derived from `adult_train_df`, so an in-place replace triggers pandas
    # chained-assignment warnings and relies on a deprecated silent downcast to
    # obtain a float dtype. `map` infers the `float64` dtype directly.
    # NOTE: values other than "less"/"more" would be mapped to NaN.
    y_train = adult_train_df["class"].map({"less": 0.0, "more": 1.0})

    # Create the classifier object
    khc = KhiopsClassifier()

    # Train the classifier
    khc.fit(X_train, y_train)

    # Predict the classes on the test dataset
    y_test_pred = khc.predict(X_test)
    print("Predicted classes (first 10):")
    print(y_test_pred[0:10])
    print("---")
170+
171+
92172
def khiops_classifier_multiclass():
93173
"""Trains a multiclass `.KhiopsClassifier` on a monotable dataframe"""
94174
# Imports
@@ -1025,6 +1105,8 @@ def khiops_classifier_multitable_star_file():
10251105

10261106
exported_samples = [
10271107
khiops_classifier,
1108+
khiops_classifier_boolean_target,
1109+
khiops_classifier_float_target,
10281110
khiops_classifier_multiclass,
10291111
khiops_classifier_multitable_star,
10301112
khiops_classifier_multitable_snowflake,

khiops/sklearn/dataset.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -738,8 +738,22 @@ def _init_target_column(self, y):
738738
if isinstance(y, str):
739739
y_checked = y
740740
else:
741-
y_checked = column_or_1d(y, warn=True)
742-
741+
if hasattr(y, "dtype"):
742+
if isinstance(y.dtype, pd.CategoricalDtype):
743+
y_checked = column_or_1d(
744+
y, warn=True, dtype=y.dtype.categories.dtype
745+
)
746+
else:
747+
y_checked = column_or_1d(y, warn=True, dtype=y.dtype)
748+
elif hasattr(y, "dtypes"):
749+
if isinstance(y.dtypes[0], pd.CategoricalDtype):
750+
y_checked = column_or_1d(
751+
y, warn=True, dtype=y.dtypes[0].categories.dtype
752+
)
753+
else:
754+
y_checked = column_or_1d(y, warn=True)
755+
else:
756+
y_checked = column_or_1d(y, warn=True)
743757
# Check the target type coherence with those of X's tables
744758
if isinstance(
745759
self.main_table, (PandasTable, SparseTable, NumpyTable)

khiops/sklearn/estimators.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ def _check_categorical_target_type(ds):
154154
or pd.api.types.is_string_dtype(ds.target_column.dtype)
155155
or pd.api.types.is_integer_dtype(ds.target_column.dtype)
156156
or pd.api.types.is_float_dtype(ds.target_column.dtype)
157+
or pd.api.types.is_bool_dtype(ds.target_column.dtype)
157158
):
158159
raise ValueError(
159160
f"'y' has invalid type '{ds.target_column_type}'. "
@@ -2123,6 +2124,24 @@ def _is_real_target_dtype_integer(self):
21232124
)
21242125
)
21252126

2127+
def _is_real_target_dtype_float(self):
2128+
return self._original_target_dtype is not None and (
2129+
pd.api.types.is_float_dtype(self._original_target_dtype)
2130+
or (
2131+
isinstance(self._original_target_dtype, pd.CategoricalDtype)
2132+
and pd.api.types.is_float_dtype(self._original_target_dtype.categories)
2133+
)
2134+
)
2135+
2136+
def _is_real_target_dtype_bool(self):
2137+
return self._original_target_dtype is not None and (
2138+
pd.api.types.is_bool_dtype(self._original_target_dtype)
2139+
or (
2140+
isinstance(self._original_target_dtype, pd.CategoricalDtype)
2141+
and pd.api.types.is_bool_dtype(self._original_target_dtype.categories)
2142+
)
2143+
)
2144+
21262145
def _sorted_prob_variable_names(self):
21272146
"""Returns the model probability variable names in the order of self.classes_"""
21282147
self._assert_is_fitted()
@@ -2227,8 +2246,13 @@ def _fit_training_post_process(self, ds):
22272246
for key in variable.meta_data.keys:
22282247
if key.startswith("TargetProb"):
22292248
self.classes_.append(variable.meta_data.get_value(key))
2230-
if ds.is_in_memory and self._is_real_target_dtype_integer():
2231-
self.classes_ = [int(class_value) for class_value in self.classes_]
2249+
if ds.is_in_memory:
2250+
if self._is_real_target_dtype_integer():
2251+
self.classes_ = [int(class_value) for class_value in self.classes_]
2252+
elif self._is_real_target_dtype_float():
2253+
self.classes_ = [float(class_value) for class_value in self.classes_]
2254+
elif self._is_real_target_dtype_bool():
2255+
self.classes_ = [class_value == "True" for class_value in self.classes_]
22322256
self.classes_.sort()
22332257
self.classes_ = column_or_1d(self.classes_)
22342258

@@ -2283,9 +2307,10 @@ def predict(self, X):
22832307
-------
22842308
`ndarray <numpy.ndarray>`
22852309
An array containing the predicted classes. A first column containing key
2286-
column ids is added in multi-table mode. The `numpy.dtype` of the array is
2287-
integer if the classifier was learned with an integer ``y``. Otherwise it
2288-
will be ``str``.
2310+
column ids is added in multi-table mode. The `numpy.dtype` of the array
2311+
matches the type of ``y`` used during training. It will be integer, float,
2312+
or boolean if the classifier was trained with a ``y`` of the corresponding
2313+
type. Otherwise it will be ``str``.
22892314
22902315
The key columns are added for multi-table tasks.
22912316
"""

0 commit comments

Comments
 (0)