Skip to content

Commit 9cec2f1

Browse files
authored
Merge pull request #462 from KhiopsML/39-support-text-types-in-sklearn-predictors
Add n_text_features and type_text features to Khiops sklearn supervised estimators
2 parents 111996c + 4556cf5 commit 9cec2f1

File tree

10 files changed

+359
-22
lines changed

10 files changed

+359
-22
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
### Added
1212
- (`core`) Dictionary API support for dictionary, variable and variable block
1313
comments, and dictionary and variable block internal comments.
14+
- (`sklearn`) `Text` Khiops type support at the estimator level.
1415

1516
### Fixed
1617
- (General) Inconsistency between the `tools.download_datasets` function and the

doc/samples/samples_sklearn.rst

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,72 @@ Samples
148148
test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr")
149149
print(f"Test accuracy = {test_accuracy}")
150150
print(f"Test auc = {test_auc}")
151+
.. autofunction:: khiops_classifier_text
152+
.. code-block:: python
153+
154+
# Imports
155+
import os
156+
import pandas as pd
157+
from khiops import core as kh
158+
from khiops.sklearn import KhiopsClassifier
159+
from sklearn import metrics
160+
from sklearn.model_selection import train_test_split
161+
162+
# Load the dataset into a pandas dataframe
163+
data_table_path = os.path.join(
164+
kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
165+
)
166+
data_df = pd.read_csv(data_table_path, sep="\t")
167+
168+
# Split the whole dataframe into train and test (70%-30%)
169+
data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)
170+
171+
# Split the dataset into:
172+
# - the X feature table
173+
# - the y target vector ("negativereason" column)
174+
X_train = data_train_df.drop("negativereason", axis=1)
175+
X_test = data_test_df.drop("negativereason", axis=1)
176+
y_train = data_train_df["negativereason"]
177+
y_test = data_test_df["negativereason"]
178+
179+
# Set Pandas StringDtype on the "text" column
180+
X_train["text"] = X_train["text"].astype("string")
181+
X_test["text"] = X_test["text"].astype("string")
182+
183+
# Create the classifier object
184+
khc = KhiopsClassifier()
185+
186+
# Train the classifier
187+
khc.fit(X_train, y_train)
188+
189+
# Show the feature importance info
190+
print(f"Features evaluated: {khc.n_features_evaluated_}")
191+
print(f"Features selected : {khc.n_features_used_}")
192+
print("Top 3 used features")
193+
for i, feature in enumerate(khc.feature_used_names_[:3]):
194+
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
195+
print("---")
196+
197+
# Predict the classes on the test dataset
198+
y_test_pred = khc.predict(X_test)
199+
print("Predicted classes (first 10):")
200+
print(y_test_pred[0:10])
201+
print("---")
202+
203+
# Predict the class probabilities on the test dataset
204+
y_test_probas = khc.predict_proba(X_test)
205+
print(f"Class order: {khc.classes_}")
206+
print("Predicted class probabilities (first 10):")
207+
print(y_test_probas[0:10])
208+
print("---")
209+
210+
# Evaluate the accuracy metric on the test dataset
211+
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
212+
print(f"Test accuracy = {test_accuracy}")
213+
214+
# If you have Khiops Visualization installed you may open the report as follows
215+
# khc.export_report_file("report.khj")
216+
# kh.visualize_report("report.khj")
151217
.. autofunction:: khiops_classifier_multitable_star
152218
.. code-block:: python
153219

khiops/samples/samples_sklearn.ipynb

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,85 @@
149149
"print(f\"Test auc = {test_auc}\")"
150150
]
151151
},
152+
{
153+
"cell_type": "markdown",
154+
"metadata": {},
155+
"source": [
156+
"### `khiops_classifier_text()`\n\n",
157+
"Train a `.KhiopsClassifier` on a monotable dataframe with textual data\n"
158+
]
159+
},
160+
{
161+
"cell_type": "code",
162+
"execution_count": null,
163+
"metadata": {},
164+
"outputs": [],
165+
"source": [
166+
"# Imports\n",
167+
"import os\n",
168+
"import pandas as pd\n",
169+
"from khiops import core as kh\n",
170+
"from khiops.sklearn import KhiopsClassifier\n",
171+
"from sklearn import metrics\n",
172+
"from sklearn.model_selection import train_test_split\n",
173+
"\n",
174+
"# Load the dataset into a pandas dataframe\n",
175+
"data_table_path = os.path.join(\n",
176+
" kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n",
177+
")\n",
178+
"data_df = pd.read_csv(data_table_path, sep=\"\\t\")\n",
179+
"\n",
180+
"# Split the whole dataframe into train and test (70%-30%)\n",
181+
"data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)\n",
182+
"\n",
183+
"# Split the dataset into:\n",
184+
"# - the X feature table\n",
185+
"# - the y target vector (\"negativereason\" column)\n",
186+
"X_train = data_train_df.drop(\"negativereason\", axis=1)\n",
187+
"X_test = data_test_df.drop(\"negativereason\", axis=1)\n",
188+
"y_train = data_train_df[\"negativereason\"]\n",
189+
"y_test = data_test_df[\"negativereason\"]\n",
190+
"\n",
191+
"# Set Pandas StringDType on the \"text\" column\n",
192+
"X_train[\"text\"] = X_train[\"text\"].astype(\"string\")\n",
193+
"X_test[\"text\"] = X_test[\"text\"].astype(\"string\")\n",
194+
"\n",
195+
"# Create the classifier object\n",
196+
"khc = KhiopsClassifier()\n",
197+
"\n",
198+
"# Train the classifier\n",
199+
"khc.fit(X_train, y_train)\n",
200+
"\n",
201+
"# Show the feature importance info\n",
202+
"print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n",
203+
"print(f\"Features selected : {khc.n_features_used_}\")\n",
204+
"print(\"Top 3 used features\")\n",
205+
"for i, feature in enumerate(khc.feature_used_names_[:3]):\n",
206+
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
207+
"print(\"---\")\n",
208+
"\n",
209+
"# Predict the classes on the test dataset\n",
210+
"y_test_pred = khc.predict(X_test)\n",
211+
"print(\"Predicted classes (first 10):\")\n",
212+
"print(y_test_pred[0:10])\n",
213+
"print(\"---\")\n",
214+
"\n",
215+
"# Predict the class probabilities on the test dataset\n",
216+
"y_test_probas = khc.predict_proba(X_test)\n",
217+
"print(f\"Class order: {khc.classes_}\")\n",
218+
"print(\"Predicted class probabilities (first 10):\")\n",
219+
"print(y_test_probas[0:10])\n",
220+
"print(\"---\")\n",
221+
"\n",
222+
"# Evaluate the accuracy metric on the test dataset\n",
223+
"test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n",
224+
"print(f\"Test accuracy = {test_accuracy}\")\n",
225+
"\n",
226+
"# If you have Khiops Visualization installed you may open the report as follows\n",
227+
"# khc.export_report_file(\"report.khj\")\n",
228+
"# kh.visualize_report(\"report.khj\")"
229+
]
230+
},
152231
{
153232
"cell_type": "markdown",
154233
"metadata": {},

khiops/samples/samples_sklearn.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,75 @@ def khiops_classifier_multiclass():
142142
print(f"Test auc = {test_auc}")
143143

144144

145+
def khiops_classifier_text():
    """Fit a `.KhiopsClassifier` on a single-table dataframe holding a text column

    The sample reads the "NegativeAirlineTweets" table from the Khiops samples
    directory, holds out 30% of its rows for testing, converts the "text" column
    to the pandas "string" dtype, trains a classifier that predicts the
    "negativereason" column, and prints feature-importance information, the
    first predictions and class probabilities, and the test accuracy.
    """
    # Imports
    import os
    import pandas as pd
    from khiops import core as kh
    from khiops.sklearn import KhiopsClassifier
    from sklearn import metrics
    from sklearn.model_selection import train_test_split

    # Read the tab-separated tweets table into a pandas dataframe
    samples_dir = kh.get_samples_dir()
    tweets_path = os.path.join(
        samples_dir, "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
    )
    tweets_df = pd.read_csv(tweets_path, sep="\t")

    # Hold out 30% of the rows for testing (fixed seed, so the split is repeatable)
    train_df, test_df = train_test_split(tweets_df, test_size=0.3, random_state=1)

    # Separate the y target vector ("negativereason" column) from the X feature table
    target_name = "negativereason"
    y_train, y_test = train_df[target_name], test_df[target_name]
    X_train = train_df.drop(target_name, axis=1)
    X_test = test_df.drop(target_name, axis=1)

    # Set the pandas StringDtype on the "text" column of both feature tables:
    # khiops.sklearn only considers the Khiops "Text" type for "string" columns
    # (and only when their longest entry exceeds 100 characters)
    for features in (X_train, X_test):
        features["text"] = features["text"].astype("string")

    # Create and train the classifier
    khc = KhiopsClassifier()
    khc.fit(X_train, y_train)

    # Report the feature importance info
    print(f"Features evaluated: {khc.n_features_evaluated_}")
    print(f"Features selected : {khc.n_features_used_}")
    print("Top 3 used features")
    top_feature_names = khc.feature_used_names_[:3]
    for rank, name in enumerate(top_feature_names):
        importance = khc.feature_used_importances_[rank][2]
        print(f"{name} - Importance: {importance}")
    print("---")

    # Predict the classes of the held-out rows
    predicted_classes = khc.predict(X_test)
    print("Predicted classes (first 10):")
    print(predicted_classes[:10])
    print("---")

    # Predict the class probabilities of the held-out rows
    predicted_probas = khc.predict_proba(X_test)
    print(f"Class order: {khc.classes_}")
    print("Predicted class probabilities (first 10):")
    print(predicted_probas[:10])
    print("---")

    # Measure the accuracy on the held-out rows
    test_accuracy = metrics.accuracy_score(y_test, predicted_classes)
    print(f"Test accuracy = {test_accuracy}")

    # If you have Khiops Visualization installed you may open the report as follows
    # khc.export_report_file("report.khj")
    # kh.visualize_report("report.khj")
212+
213+
145214
def khiops_classifier_multitable_star():
146215
"""Trains a `.KhiopsClassifier` on a star multi-table dataset"""
147216
# Imports
@@ -831,6 +900,7 @@ def khiops_coclustering_simplify():
831900
exported_samples = [
832901
khiops_classifier,
833902
khiops_classifier_multiclass,
903+
khiops_classifier_text,
834904
khiops_classifier_multitable_star,
835905
khiops_classifier_multitable_snowflake,
836906
khiops_classifier_sparse,

khiops/sklearn/dataset.py

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -236,19 +236,40 @@ def _upgrade_mapping_spec(ds_spec):
236236
return new_ds_spec
237237

238238

239-
def get_khiops_type(numpy_type):
239+
def get_khiops_type(numpy_type, categorical_str_max_size=None):
240240
"""Translates a numpy dtype to a Khiops dictionary type
241241
242242
Parameters
243243
----------
244-
numpy_type : `numpy.dtype`:
244+
numpy_type : `numpy.dtype`
245245
Numpy type of the column
246+
categorical_str_max_size : `int`, optional
247+
Maximum length of the entries of the column whose type is ``numpy_type``.
246248
247249
Returns
248250
-------
249251
str
250-
Khiops type name. Either "Categorical", "Numerical" or "Timestamp"
252+
Khiops type name. Either "Categorical", "Text", "Numerical" or "Timestamp".
253+
254+
.. note::
255+
The "Text" Khiops type is inferred if the Numpy type is "string"
256+
and the maximum length of the entries of that type is greater than 100.
257+
251258
"""
259+
# Check categorical_str_max_size type
260+
if categorical_str_max_size is not None and not isinstance(
261+
categorical_str_max_size, (int, np.int64)
262+
):
263+
raise TypeError(
264+
type_error_message(
265+
"categorical_str_max_size",
266+
categorical_str_max_size,
267+
int,
268+
np.int64,
269+
)
270+
)
271+
272+
# Get the Numpy dtype in lowercase
252273
lower_numpy_type = str(numpy_type).lower()
253274

254275
# timedelta64 and datetime64 types
@@ -257,6 +278,11 @@ def get_khiops_type(numpy_type):
257278
# float<x>, int<x>, uint<x> types
258279
elif "int" in lower_numpy_type or "float" in lower_numpy_type:
259280
khiops_type = "Numerical"
281+
elif lower_numpy_type == "string":
282+
if categorical_str_max_size is not None and categorical_str_max_size > 100:
283+
khiops_type = "Text"
284+
else:
285+
khiops_type = "Categorical"
260286
# bool_ and object, character, bytes_, str_, void, record and other types
261287
else:
262288
khiops_type = "Categorical"
@@ -956,10 +982,16 @@ def __init__(self, name, dataframe, key=None):
956982
)
957983

958984
# Initialize Khiops types
959-
self.khiops_types = {
960-
column_id: get_khiops_type(self.data_source.dtypes[column_id])
961-
for column_id in self.column_ids
962-
}
985+
self.khiops_types = {}
986+
for column_id in self.column_ids:
987+
column = self.data_source[column_id]
988+
column_numpy_type = column.dtype
989+
column_max_size = None
990+
if isinstance(column_numpy_type, pd.StringDtype):
991+
column_max_size = column.str.len().max()
992+
self.khiops_types[column_id] = get_khiops_type(
993+
column_numpy_type, column_max_size
994+
)
963995

964996
# Check key integrity
965997
self.check_key()

0 commit comments

Comments
 (0)