Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
### Added
- (`core`) Dictionary API support for dictionary, variable and variable block
comments, and dictionary and variable block internal comments.
- (`sklearn`) `Text` Khiops type support at the estimator level.

### Fixed
- (General) Inconsistency between the `tools.download_datasets` function and the
Expand Down
66 changes: 66 additions & 0 deletions doc/samples/samples_sklearn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,72 @@ Samples
test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr")
print(f"Test accuracy = {test_accuracy}")
print(f"Test auc = {test_auc}")
.. autofunction:: khiops_classifier_text
.. code-block:: python

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Load the dataset into a pandas dataframe
data_table_path = os.path.join(
kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
)
data_df = pd.read_csv(data_table_path, sep="\t")

# Split the whole dataframe into train and test (70%-30%)
data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)

# Split the dataset into:
# - the X feature table
# - the y target vector ("negativereason" column)
X_train = data_train_df.drop("negativereason", axis=1)
X_test = data_test_df.drop("negativereason", axis=1)
y_train = data_train_df["negativereason"]
y_test = data_test_df["negativereason"]

# Set Pandas StringDType on the "text" column
X_train["text"] = X_train["text"].astype("string")
X_test["text"] = X_test["text"].astype("string")

# Create the classifier object
khc = KhiopsClassifier()

# Train the classifier
khc.fit(X_train, y_train)

# Show the feature importance info
print(f"Features evaluated: {khc.n_features_evaluated_}")
print(f"Features selected : {khc.n_features_used_}")
print("Top 3 used features")
for i, feature in enumerate(khc.feature_used_names_[:3]):
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
print("---")

# Predict the classes on the test dataset
y_test_pred = khc.predict(X_test)
print("Predicted classes (first 10):")
print(y_test_pred[0:10])
print("---")

# Predict the class probabilities on the test dataset
y_test_probas = khc.predict_proba(X_test)
print(f"Class order: {khc.classes_}")
print("Predicted class probabilities (first 10):")
print(y_test_probas[0:10])
print("---")

# Evaluate the accuracy metric on the test dataset
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
print(f"Test accuracy = {test_accuracy}")

# If you have Khiops Visualization installed you may open the report as follows
# khc.export_report_file("report.khj")
# kh.visualize_report("report.khj")
.. autofunction:: khiops_classifier_multitable_star
.. code-block:: python

Expand Down
79 changes: 79 additions & 0 deletions khiops/samples/samples_sklearn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,85 @@
"print(f\"Test auc = {test_auc}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `khiops_classifier_text()`\n\n",
"Train a `.KhiopsClassifier` on a monotable dataframe with textual data\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"import os\n",
"import pandas as pd\n",
"from khiops import core as kh\n",
"from khiops.sklearn import KhiopsClassifier\n",
"from sklearn import metrics\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Load the dataset into a pandas dataframe\n",
"data_table_path = os.path.join(\n",
" kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n",
")\n",
"data_df = pd.read_csv(data_table_path, sep=\"\\t\")\n",
"\n",
"# Split the whole dataframe into train and test (70%-30%)\n",
"data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)\n",
"\n",
"# Split the dataset into:\n",
"# - the X feature table\n",
"# - the y target vector (\"negativereason\" column)\n",
"X_train = data_train_df.drop(\"negativereason\", axis=1)\n",
"X_test = data_test_df.drop(\"negativereason\", axis=1)\n",
"y_train = data_train_df[\"negativereason\"]\n",
"y_test = data_test_df[\"negativereason\"]\n",
"\n",
"# Set Pandas StringDType on the \"text\" column\n",
"X_train[\"text\"] = X_train[\"text\"].astype(\"string\")\n",
"X_test[\"text\"] = X_test[\"text\"].astype(\"string\")\n",
"\n",
"# Create the classifier object\n",
"khc = KhiopsClassifier()\n",
"\n",
"# Train the classifier\n",
"khc.fit(X_train, y_train)\n",
"\n",
"# Show the feature importance info\n",
"print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n",
"print(f\"Features selected : {khc.n_features_used_}\")\n",
"print(\"Top 3 used features\")\n",
"for i, feature in enumerate(khc.feature_used_names_[:3]):\n",
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
"print(\"---\")\n",
"\n",
"# Predict the classes on the test dataset\n",
"y_test_pred = khc.predict(X_test)\n",
"print(\"Predicted classes (first 10):\")\n",
"print(y_test_pred[0:10])\n",
"print(\"---\")\n",
"\n",
"# Predict the class probabilities on the test dataset\n",
"y_test_probas = khc.predict_proba(X_test)\n",
"print(f\"Class order: {khc.classes_}\")\n",
"print(\"Predicted class probabilities (first 10):\")\n",
"print(y_test_probas[0:10])\n",
"print(\"---\")\n",
"\n",
"# Evaluate the accuracy metric on the test dataset\n",
"test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n",
"print(f\"Test accuracy = {test_accuracy}\")\n",
"\n",
"# If you have Khiops Visualization installed you may open the report as follows\n",
"# khc.export_report_file(\"report.khj\")\n",
"# kh.visualize_report(\"report.khj\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
70 changes: 70 additions & 0 deletions khiops/samples/samples_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,75 @@ def khiops_classifier_multiclass():
print(f"Test auc = {test_auc}")


def khiops_classifier_text():
    """Fit a `.KhiopsClassifier` on a single-table dataframe holding text data

    The sample reads the "NegativeAirlineTweets" dataset, converts its "text" column
    to the pandas "string" dtype, trains on a 70% split and then prints the feature
    importances, the test predictions and the test accuracy.
    """
    # Imports
    import os
    import pandas as pd
    from khiops import core as kh
    from khiops.sklearn import KhiopsClassifier
    from sklearn import metrics
    from sklearn.model_selection import train_test_split

    # Read the tab-separated tweets file shipped with the Khiops samples
    tweets_path = os.path.join(
        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
    )
    tweets_df = pd.read_csv(tweets_path, sep="\t")

    # Hold out 30% of the rows for testing (fixed seed for reproducibility)
    train_df, test_df = train_test_split(tweets_df, test_size=0.3, random_state=1)

    # Separate, for each split:
    # - the X feature table (every column but "negativereason")
    # - the y target vector (the "negativereason" column)
    X_train, y_train = (
        train_df.drop("negativereason", axis=1),
        train_df["negativereason"],
    )
    X_test, y_test = (
        test_df.drop("negativereason", axis=1),
        test_df["negativereason"],
    )

    # Use the Pandas StringDType for the "text" column of both feature tables
    for features in (X_train, X_test):
        features["text"] = features["text"].astype("string")

    # Build the classifier and train it
    khc = KhiopsClassifier()
    khc.fit(X_train, y_train)

    # Report how many features were evaluated/kept, then the first three used
    print(f"Features evaluated: {khc.n_features_evaluated_}")
    print(f"Features selected : {khc.n_features_used_}")
    print("Top 3 used features")
    top_feature_names = khc.feature_used_names_[:3]
    for rank, name in enumerate(top_feature_names):
        importance = khc.feature_used_importances_[rank][2]
        print(f"{name} - Importance: {importance}")
    print("---")

    # Class predictions on the held-out rows
    predicted_classes = khc.predict(X_test)
    print("Predicted classes (first 10):")
    print(predicted_classes[:10])
    print("---")

    # Class probability predictions on the held-out rows
    predicted_probas = khc.predict_proba(X_test)
    print(f"Class order: {khc.classes_}")
    print("Predicted class probabilities (first 10):")
    print(predicted_probas[:10])
    print("---")

    # Accuracy of the predicted classes against the true test targets
    test_accuracy = metrics.accuracy_score(y_test, predicted_classes)
    print(f"Test accuracy = {test_accuracy}")

    # If you have Khiops Visualization installed you may open the report as follows
    # khc.export_report_file("report.khj")
    # kh.visualize_report("report.khj")


def khiops_classifier_multitable_star():
"""Trains a `.KhiopsClassifier` on a star multi-table dataset"""
# Imports
Expand Down Expand Up @@ -831,6 +900,7 @@ def khiops_coclustering_simplify():
exported_samples = [
khiops_classifier,
khiops_classifier_multiclass,
khiops_classifier_text,
khiops_classifier_multitable_star,
khiops_classifier_multitable_snowflake,
khiops_classifier_sparse,
Expand Down
46 changes: 39 additions & 7 deletions khiops/sklearn/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,19 +236,40 @@ def _upgrade_mapping_spec(ds_spec):
return new_ds_spec


def get_khiops_type(numpy_type):
def get_khiops_type(numpy_type, categorical_str_max_size=None):
"""Translates a numpy dtype to a Khiops dictionary type

Parameters
----------
numpy_type : `numpy.dtype`:
numpy_type : `numpy.dtype`
Numpy type of the column
categorical_str_max_size : `int`, optional
Maximum length of the entries of the column whose type is ``numpy_type``.

Returns
-------
str
Khiops type name. Either "Categorical", "Numerical" or "Timestamp"
Khiops type name. Either "Categorical", "Text", "Numerical" or "Timestamp".

.. note::
The "Text" Khiops type is inferred if the Numpy type is "string"
and the maximum length of the entries of that type is greater than 100.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe worth defining a constant in the file (and referred here) because the value of 100 is arbitrary and could be changed in the future.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This value is identical to the Khiops Core value which serves the same purpose (as explained in the commit message), AFAIU.


"""
# Check categorical_str_max_size type
if categorical_str_max_size is not None and not isinstance(
categorical_str_max_size, (int, np.int64)
):
raise TypeError(
type_error_message(
"categorical_str_max_size",
categorical_str_max_size,
int,
np.int64,
)
)

# Get the Numpy dtype in lowercase
lower_numpy_type = str(numpy_type).lower()

# timedelta64 and datetime64 types
Expand All @@ -257,6 +278,11 @@ def get_khiops_type(numpy_type):
# float<x>, int<x>, uint<x> types
elif "int" in lower_numpy_type or "float" in lower_numpy_type:
khiops_type = "Numerical"
elif lower_numpy_type == "string":
if categorical_str_max_size is not None and categorical_str_max_size > 100:
khiops_type = "Text"
else:
khiops_type = "Categorical"
# bool_ and object, character, bytes_, str_, void, record and other types
else:
khiops_type = "Categorical"
Expand Down Expand Up @@ -956,10 +982,16 @@ def __init__(self, name, dataframe, key=None):
)

# Initialize Khiops types
self.khiops_types = {
column_id: get_khiops_type(self.data_source.dtypes[column_id])
for column_id in self.column_ids
}
self.khiops_types = {}
for column_id in self.column_ids:
column = self.data_source[column_id]
column_numpy_type = column.dtype
column_max_size = None
if isinstance(column_numpy_type, pd.StringDtype):
column_max_size = column.str.len().max()
self.khiops_types[column_id] = get_khiops_type(
column_numpy_type, column_max_size
)

# Check key integrity
self.check_key()
Expand Down
Loading