Skip to content

Commit 19ebe3c

Browse files
committed
support float and boolean targets in KhiopsClassifier
1 parent 747d9c4 commit 19ebe3c

File tree

5 files changed

+288
-11
lines changed

5 files changed

+288
-11
lines changed

doc/samples/samples_sklearn.rst

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,95 @@ Samples
479479
test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
480480
print(f"Test accuracy = {test_accuracy}")
481481
print(f"Test auc = {test_auc}")
482+
.. autofunction:: khiops_classifier_float_target
483+
.. code-block:: python
484+
485+
# Imports
486+
import os
487+
import pandas as pd
488+
from khiops import core as kh
489+
from khiops.sklearn import KhiopsClassifier
490+
from sklearn.model_selection import train_test_split
491+
492+
# Load the dataset into a pandas dataframe
493+
adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
494+
adult_df = pd.read_csv(adult_path, sep="\t")
495+
adult_df["class"] = adult_df["class"].replace({"less": 0.0, "more": 1.0})
496+
497+
# Split the whole dataframe into train and test (70%-30%)
498+
adult_train_df, adult_test_df = train_test_split(
499+
adult_df, test_size=0.3, random_state=1
500+
)
501+
502+
# Split the dataset into:
503+
# - the X feature table
504+
# - the y target vector ("class" column)
505+
X_train = adult_train_df.drop("class", axis=1)
506+
X_test = adult_test_df.drop("class", axis=1)
507+
y_train = adult_train_df["class"]
508+
509+
# Create the classifier object
510+
khc = KhiopsClassifier()
511+
512+
# Train the classifier
513+
khc.fit(X_train, y_train)
514+
# Predict the classes on the test dataset
515+
y_test_pred = khc.predict(X_test)
516+
print("Predicted classes (first 10):")
517+
print(y_test_pred[0:10])
518+
print("---")
519+
520+
# Predict the class probabilities on the test dataset
521+
y_test_probas = khc.predict_proba(X_test)
522+
print(f"Class order: {khc.classes_}")
523+
print("Predicted class probabilities (first 10):")
524+
print(y_test_probas[0:10])
525+
print("---")
526+
.. autofunction:: khiops_classifier_boolean_target
527+
.. code-block:: python
528+
529+
# Imports
530+
import os
531+
import pandas as pd
532+
from khiops import core as kh
533+
from khiops.sklearn import KhiopsClassifier
534+
from sklearn.model_selection import train_test_split
535+
536+
# Load the dataset into a pandas dataframe
537+
adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
538+
adult_df = pd.read_csv(adult_path, sep="\t")
539+
adult_df["class"] = adult_df["class"].replace({"less": False, "more": True})
540+
541+
# Split the whole dataframe into train and test (70%-30%)
542+
adult_train_df, adult_test_df = train_test_split(
543+
adult_df, test_size=0.3, random_state=1
544+
)
545+
546+
# Split the dataset into:
547+
# - the X feature table
548+
# - the y target vector ("class" column)
549+
X_train = adult_train_df.drop("class", axis=1)
550+
X_test = adult_test_df.drop("class", axis=1)
551+
y_train = adult_train_df["class"]
552+
553+
# Create the classifier object
554+
khc = KhiopsClassifier()
555+
556+
# Train the classifier
557+
khc.fit(X_train, y_train)
558+
559+
# Predict the classes on the test dataset
560+
y_test_pred = khc.predict(X_test)
561+
print("Predicted classes (first 10):")
562+
print(y_test_pred[0:10])
563+
print("---")
564+
565+
# Predict the class probabilities on the test dataset
566+
y_test_probas = khc.predict_proba(X_test)
567+
print(f"Class order: {khc.classes_}")
568+
print("Predicted class probabilities (first 10):")
569+
print(y_test_probas[0:10])
570+
print("---")
482571
.. autofunction:: khiops_regressor
483572
.. code-block:: python
484573

khiops/samples/samples_sklearn.ipynb

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,121 @@
545545
"print(f\"Test auc = {test_auc}\")"
546546
]
547547
},
548+
{
549+
"cell_type": "markdown",
550+
"metadata": {},
551+
"source": [
552+
"### `khiops_classifier_float_target()`\n\n",
553+
"Trains a `.KhiopsClassifier` on a monotable dataframe\n with a float target\n"
554+
]
555+
},
556+
{
557+
"cell_type": "code",
558+
"execution_count": null,
559+
"metadata": {},
560+
"outputs": [],
561+
"source": [
562+
"# Imports\n",
563+
"import os\n",
564+
"import pandas as pd\n",
565+
"from khiops import core as kh\n",
566+
"from khiops.sklearn import KhiopsClassifier\n",
567+
"from sklearn.model_selection import train_test_split\n",
568+
"\n",
569+
"# Load the dataset into a pandas dataframe\n",
570+
"adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n",
571+
"adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n",
572+
"adult_df[\"class\"] = adult_df[\"class\"].replace({\"less\": 0.0, \"more\": 1.0})\n",
573+
"\n",
574+
"# Split the whole dataframe into train and test (70%-30%)\n",
575+
"adult_train_df, adult_test_df = train_test_split(\n",
576+
" adult_df, test_size=0.3, random_state=1\n",
577+
")\n",
578+
"\n",
579+
"# Split the dataset into:\n",
580+
"# - the X feature table\n",
581+
"# - the y target vector (\"class\" column)\n",
582+
"X_train = adult_train_df.drop(\"class\", axis=1)\n",
583+
"X_test = adult_test_df.drop(\"class\", axis=1)\n",
584+
"y_train = adult_train_df[\"class\"]\n",
585+
"\n",
586+
"# Create the classifier object\n",
587+
"khc = KhiopsClassifier()\n",
588+
"\n",
589+
"# Train the classifier\n",
590+
"khc.fit(X_train, y_train)\n",
591+
"# Predict the classes on the test dataset\n",
592+
"y_test_pred = khc.predict(X_test)\n",
593+
"print(\"Predicted classes (first 10):\")\n",
594+
"print(y_test_pred[0:10])\n",
595+
"print(\"---\")\n",
596+
"\n",
597+
"# Predict the class probabilities on the test dataset\n",
598+
"y_test_probas = khc.predict_proba(X_test)\n",
599+
"print(f\"Class order: {khc.classes_}\")\n",
600+
"print(\"Predicted class probabilities (first 10):\")\n",
601+
"print(y_test_probas[0:10])\n",
602+
"print(\"---\")"
603+
]
604+
},
605+
{
606+
"cell_type": "markdown",
607+
"metadata": {},
608+
"source": [
609+
"### `khiops_classifier_boolean_target()`\n\n",
610+
"Trains a `.KhiopsClassifier` on a monotable dataframe\n with a boolean target\n"
611+
]
612+
},
613+
{
614+
"cell_type": "code",
615+
"execution_count": null,
616+
"metadata": {},
617+
"outputs": [],
618+
"source": [
619+
"# Imports\n",
620+
"import os\n",
621+
"import pandas as pd\n",
622+
"from khiops import core as kh\n",
623+
"from khiops.sklearn import KhiopsClassifier\n",
624+
"from sklearn.model_selection import train_test_split\n",
625+
"\n",
626+
"# Load the dataset into a pandas dataframe\n",
627+
"adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n",
628+
"adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n",
629+
"adult_df[\"class\"] = adult_df[\"class\"].replace({\"less\": False, \"more\": True})\n",
630+
"\n",
631+
"# Split the whole dataframe into train and test (70%-30%)\n",
632+
"adult_train_df, adult_test_df = train_test_split(\n",
633+
" adult_df, test_size=0.3, random_state=1\n",
634+
")\n",
635+
"\n",
636+
"# Split the dataset into:\n",
637+
"# - the X feature table\n",
638+
"# - the y target vector (\"class\" column)\n",
639+
"X_train = adult_train_df.drop(\"class\", axis=1)\n",
640+
"X_test = adult_test_df.drop(\"class\", axis=1)\n",
641+
"y_train = adult_train_df[\"class\"]\n",
642+
"\n",
643+
"# Create the classifier object\n",
644+
"khc = KhiopsClassifier()\n",
645+
"\n",
646+
"# Train the classifier\n",
647+
"khc.fit(X_train, y_train)\n",
648+
"\n",
649+
"# Predict the classes on the test dataset\n",
650+
"y_test_pred = khc.predict(X_test)\n",
651+
"print(\"Predicted classes (first 10):\")\n",
652+
"print(y_test_pred[0:10])\n",
653+
"print(\"---\")\n",
654+
"\n",
655+
"# Predict the class probabilities on the test dataset\n",
656+
"y_test_probas = khc.predict_proba(X_test)\n",
657+
"print(f\"Class order: {khc.classes_}\")\n",
658+
"print(\"Predicted class probabilities (first 10):\")\n",
659+
"print(y_test_probas[0:10])\n",
660+
"print(\"---\")"
661+
]
662+
},
548663
{
549664
"cell_type": "markdown",
550665
"metadata": {},

khiops/samples/samples_sklearn.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,52 @@ def khiops_classifier_float_target():
119119
# Create the classifier object
120120
khc = KhiopsClassifier()
121121

122+
# Train the classifier
123+
khc.fit(X_train, y_train)
124+
# Predict the classes on the test dataset
125+
y_test_pred = khc.predict(X_test)
126+
print("Predicted classes (first 10):")
127+
print(y_test_pred[0:10])
128+
print("---")
129+
130+
# Predict the class probabilities on the test dataset
131+
y_test_probas = khc.predict_proba(X_test)
132+
print(f"Class order: {khc.classes_}")
133+
print("Predicted class probabilities (first 10):")
134+
print(y_test_probas[0:10])
135+
print("---")
136+
137+
138+
def khiops_classifier_boolean_target():
139+
"""Trains a `.KhiopsClassifier` on a monotable dataframe
140+
with a boolean target"""
141+
# Imports
142+
import os
143+
import pandas as pd
144+
from khiops import core as kh
145+
from khiops.sklearn import KhiopsClassifier
146+
from sklearn.model_selection import train_test_split
147+
148+
# Load the dataset into a pandas dataframe
149+
adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
150+
adult_df = pd.read_csv(adult_path, sep="\t")
151+
adult_df["class"] = adult_df["class"].replace({"less": False, "more": True})
152+
153+
# Split the whole dataframe into train and test (70%-30%)
154+
adult_train_df, adult_test_df = train_test_split(
155+
adult_df, test_size=0.3, random_state=1
156+
)
157+
158+
# Split the dataset into:
159+
# - the X feature table
160+
# - the y target vector ("class" column)
161+
X_train = adult_train_df.drop("class", axis=1)
162+
X_test = adult_test_df.drop("class", axis=1)
163+
y_train = adult_train_df["class"]
164+
165+
# Create the classifier object
166+
khc = KhiopsClassifier()
167+
122168
# Train the classifier
123169
khc.fit(X_train, y_train)
124170

@@ -1108,8 +1154,6 @@ def khiops_classifier_multitable_star_file():
11081154
print(f"Test auc = {test_auc}")
11091155

11101156

1111-
exported_samples = [khiops_classifier_float_target]
1112-
"""
11131157
exported_samples = [
11141158
khiops_classifier,
11151159
khiops_classifier_multiclass,
@@ -1118,6 +1162,8 @@ def khiops_classifier_multitable_star_file():
11181162
khiops_classifier_sparse,
11191163
khiops_classifier_pickle,
11201164
khiops_classifier_with_hyperparameters,
1165+
khiops_classifier_float_target,
1166+
khiops_classifier_boolean_target,
11211167
khiops_regressor,
11221168
khiops_encoder,
11231169
khiops_encoder_multitable_star,
@@ -1129,7 +1175,6 @@ def khiops_classifier_multitable_star_file():
11291175
khiops_classifier_multitable_list,
11301176
khiops_classifier_multitable_star_file,
11311177
]
1132-
"""
11331178

11341179

11351180
def execute_samples(args):

khiops/sklearn/estimators.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ def _check_categorical_target_type(ds):
153153
or pd.api.types.is_string_dtype(ds.target_column.dtype)
154154
or pd.api.types.is_integer_dtype(ds.target_column.dtype)
155155
or pd.api.types.is_float_dtype(ds.target_column.dtype)
156+
or pd.api.types.is_bool_dtype(ds.target_column.dtype)
156157
):
157158
raise ValueError(
158159
f"'y' has invalid type '{ds.target_column_type}'. "
@@ -2093,6 +2094,24 @@ def _is_real_target_dtype_integer(self):
20932094
)
20942095
)
20952096

2097+
def _is_real_target_dtype_float(self):
    """Tell whether the target seen at fit time holds float values.

    Returns
    -------
    bool
        ``True`` when ``self._original_target_dtype`` is a float dtype, or a
        `pandas.CategoricalDtype` whose categories are floats. ``False``
        otherwise, including when ``self._original_target_dtype`` is ``None``.
    """
    target_dtype = self._original_target_dtype

    # No dtype was recorded: nothing to test
    if target_dtype is None:
        return False

    # For a categorical target the relevant type is that of its categories
    if isinstance(target_dtype, pd.CategoricalDtype):
        return pd.api.types.is_float_dtype(target_dtype.categories)

    return pd.api.types.is_float_dtype(target_dtype)
2105+
2106+
def _is_real_target_dtype_bool(self):
    """Tell whether the target seen at fit time holds boolean values.

    Returns
    -------
    bool
        ``True`` when ``self._original_target_dtype`` is a boolean dtype, or a
        `pandas.CategoricalDtype` whose categories are booleans. ``False``
        otherwise, including when ``self._original_target_dtype`` is ``None``.
    """
    target_dtype = self._original_target_dtype

    # No dtype was recorded: nothing to test
    if target_dtype is None:
        return False

    # For a categorical target the relevant type is that of its categories
    if isinstance(target_dtype, pd.CategoricalDtype):
        return pd.api.types.is_bool_dtype(target_dtype.categories)

    return pd.api.types.is_bool_dtype(target_dtype)
2114+
20962115
def _sorted_prob_variable_names(self):
20972116
"""Returns the model probability variable names in the order of self.classes_"""
20982117
assert self.is_fitted_, "Model not fit yet"
@@ -2195,11 +2214,15 @@ def _fit_training_post_process(self, ds):
21952214
for key in variable.meta_data.keys:
21962215
if key.startswith("TargetProb"):
21972216
self.classes_.append(variable.meta_data.get_value(key))
2198-
if ds.is_in_memory and self._is_real_target_dtype_integer():
2199-
self.classes_ = [int(class_value) for class_value in self.classes_]
2217+
if ds.is_in_memory:
2218+
if self._is_real_target_dtype_integer():
2219+
self.classes_ = [int(class_value) for class_value in self.classes_]
2220+
elif self._is_real_target_dtype_float():
2221+
self.classes_ = [float(class_value) for class_value in self.classes_]
2222+
elif self._is_real_target_dtype_bool():
2223+
self.classes_ = [class_value == "True" for class_value in self.classes_]
22002224
self.classes_.sort()
22012225
self.classes_ = column_or_1d(self.classes_)
2202-
22032226
# Count number of classes
22042227
self.n_classes_ = len(self.classes_)
22052228

@@ -2273,10 +2296,10 @@ def predict(self, X):
22732296
self._original_target_dtype
22742297
):
22752298
y_pred = y_pred.astype(str, copy=False)
2276-
elif pd.api.types.is_float_dtype(self._original_target_type):
2277-
print(self._original_target_type)
2278-
y_pred = y_pred.astype(str, copy=False)
2279-
print(y_pred)
2299+
elif pd.api.types.is_float_dtype(self._original_target_dtype):
2300+
y_pred = y_pred.astype(float, copy=False)
2301+
elif pd.api.types.is_bool_dtype(self._original_target_dtype):
2302+
y_pred = y_pred.astype(bool, copy=False)
22802303
# If category first coerce the type to the categories' type
22812304
else:
22822305
assert isinstance(self._original_target_dtype, pd.CategoricalDtype), (

tests/test_sklearn_output_types.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,12 @@ def test_classifier_output_types(self):
5454
"""Test the KhiopsClassifier output types and classes of predict* methods"""
5555
X, y = create_iris()
5656
X_mt, X_sec_mt, _ = create_iris_mt()
57-
5857
fixtures = {
5958
"ys": {
6059
"int": y,
6160
"int binary": y.replace({0: 0, 1: 0, 2: 1}),
61+
"float": y.replace({0: 0.0, 1: 1.0, 2: 2.0}).astype(float),
62+
"bool": y.replace({0: True, 1: True, 2: False}),
6263
"string": y.replace({0: "se", 1: "vi", 2: "ve"}),
6364
"string binary": y.replace({0: "vi_or_se", 1: "vi_or_se", 2: "ve"}),
6465
"int as string": y.replace({0: "8", 1: "9", 2: "10"}),
@@ -69,6 +70,8 @@ def test_classifier_output_types(self):
6970
"y_type_check": {
7071
"int": pd.api.types.is_integer_dtype,
7172
"int binary": pd.api.types.is_integer_dtype,
73+
"float": pd.api.types.is_float_dtype,
74+
"bool": pd.api.types.is_bool_dtype,
7275
"string": pd.api.types.is_string_dtype,
7376
"string binary": pd.api.types.is_string_dtype,
7477
"int as string": pd.api.types.is_string_dtype,
@@ -79,6 +82,8 @@ def test_classifier_output_types(self):
7982
"expected_classes": {
8083
"int": column_or_1d([0, 1, 2]),
8184
"int binary": column_or_1d([0, 1]),
85+
"float": column_or_1d([0.0, 1.0, 2.0]),
86+
"bool": column_or_1d([False, True]),
8287
"string": column_or_1d(["se", "ve", "vi"]),
8388
"string binary": column_or_1d(["ve", "vi_or_se"]),
8489
"int as string": column_or_1d(["10", "8", "9"]),

0 commit comments

Comments
 (0)