Skip to content

Commit 1ebfa10

Browse files
committed
Add Text data Sklearn sample
1 parent cea9546 commit 1ebfa10

File tree

3 files changed

+215
-0
lines changed

3 files changed

+215
-0
lines changed

doc/samples/samples_sklearn.rst

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,72 @@ Samples
9595
print(f"Test accuracy = {test_accuracy}")
9696
print(f"Test auc = {test_auc}")
9797
98+
# If you have Khiops Visualization installed you may open the report as follows
99+
# khc.export_report_file("report.khj")
100+
# kh.visualize_report("report.khj")
101+
.. autofunction:: khiops_classifier_text
102+
.. code-block:: python
103+
104+
# Imports
105+
import os
106+
import pandas as pd
107+
from khiops import core as kh
108+
from khiops.sklearn import KhiopsClassifier
109+
from sklearn import metrics
110+
from sklearn.model_selection import train_test_split
111+
112+
# Load the dataset into a pandas dataframe
113+
data_table_path = os.path.join(
114+
kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
115+
)
116+
data_df = pd.read_csv(data_table_path, sep="\t")
117+
118+
# Split the whole dataframe into train and test (70%-30%)
119+
data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)
120+
121+
# Split the dataset into:
122+
# - the X feature table
123+
# - the y target vector ("negativereason" column)
124+
X_train = data_train_df.drop("negativereason", axis=1)
125+
X_test = data_test_df.drop("negativereason", axis=1)
126+
y_train = data_train_df["negativereason"]
127+
y_test = data_test_df["negativereason"]
128+
129+
# Set the pandas StringDtype on the "text" column
130+
X_train["text"] = X_train["text"].astype("string")
131+
X_test["text"] = X_test["text"].astype("string")
132+
133+
# Create the classifier object
134+
khc = KhiopsClassifier()
135+
136+
# Train the classifier
137+
khc.fit(X_train, y_train)
138+
139+
# Show the feature importance info
140+
print(f"Features evaluated: {khc.n_features_evaluated_}")
141+
print(f"Features selected : {khc.n_features_used_}")
142+
print("Top 3 used features")
143+
for i, feature in enumerate(khc.feature_used_names_[:3]):
144+
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
145+
print("---")
146+
147+
# Predict the classes on the test dataset
148+
y_test_pred = khc.predict(X_test)
149+
print("Predicted classes (first 10):")
150+
print(y_test_pred[0:10])
151+
print("---")
152+
153+
# Predict the class probabilities on the test dataset
154+
y_test_probas = khc.predict_proba(X_test)
155+
print(f"Class order: {khc.classes_}")
156+
print("Predicted class probabilities (first 10):")
157+
print(y_test_probas[0:10])
158+
print("---")
159+
160+
# Evaluate the accuracy metric on the test dataset
161+
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
162+
print(f"Test accuracy = {test_accuracy}")
163+
98164
# If you have Khiops Visualization installed you may open the report as follows
99165
# khc.export_report_file("report.khj")
100166
# kh.visualize_report("report.khj")

khiops/samples/samples_sklearn.ipynb

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,85 @@
8686
"# kh.visualize_report(\"report.khj\")"
8787
]
8888
},
89+
{
90+
"cell_type": "markdown",
91+
"metadata": {},
92+
"source": [
93+
"### `khiops_classifier_text()`\n\n",
94+
"Train a `.KhiopsClassifier` on a monotable dataframe with textual data\n"
95+
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": null,
100+
"metadata": {},
101+
"outputs": [],
102+
"source": [
103+
"# Imports\n",
104+
"import os\n",
105+
"import pandas as pd\n",
106+
"from khiops import core as kh\n",
107+
"from khiops.sklearn import KhiopsClassifier\n",
108+
"from sklearn import metrics\n",
109+
"from sklearn.model_selection import train_test_split\n",
110+
"\n",
111+
"# Load the dataset into a pandas dataframe\n",
112+
"data_table_path = os.path.join(\n",
113+
" kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n",
114+
")\n",
115+
"data_df = pd.read_csv(data_table_path, sep=\"\\t\")\n",
116+
"\n",
117+
"# Split the whole dataframe into train and test (70%-30%)\n",
118+
"data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)\n",
119+
"\n",
120+
"# Split the dataset into:\n",
121+
"# - the X feature table\n",
122+
"# - the y target vector (\"negativereason\" column)\n",
123+
"X_train = data_train_df.drop(\"negativereason\", axis=1)\n",
124+
"X_test = data_test_df.drop(\"negativereason\", axis=1)\n",
125+
"y_train = data_train_df[\"negativereason\"]\n",
126+
"y_test = data_test_df[\"negativereason\"]\n",
127+
"\n",
128+
"# Set the pandas StringDtype on the \"text\" column\n",
129+
"X_train[\"text\"] = X_train[\"text\"].astype(\"string\")\n",
130+
"X_test[\"text\"] = X_test[\"text\"].astype(\"string\")\n",
131+
"\n",
132+
"# Create the classifier object\n",
133+
"khc = KhiopsClassifier()\n",
134+
"\n",
135+
"# Train the classifier\n",
136+
"khc.fit(X_train, y_train)\n",
137+
"\n",
138+
"# Show the feature importance info\n",
139+
"print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n",
140+
"print(f\"Features selected : {khc.n_features_used_}\")\n",
141+
"print(\"Top 3 used features\")\n",
142+
"for i, feature in enumerate(khc.feature_used_names_[:3]):\n",
143+
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
144+
"print(\"---\")\n",
145+
"\n",
146+
"# Predict the classes on the test dataset\n",
147+
"y_test_pred = khc.predict(X_test)\n",
148+
"print(\"Predicted classes (first 10):\")\n",
149+
"print(y_test_pred[0:10])\n",
150+
"print(\"---\")\n",
151+
"\n",
152+
"# Predict the class probabilities on the test dataset\n",
153+
"y_test_probas = khc.predict_proba(X_test)\n",
154+
"print(f\"Class order: {khc.classes_}\")\n",
155+
"print(\"Predicted class probabilities (first 10):\")\n",
156+
"print(y_test_probas[0:10])\n",
157+
"print(\"---\")\n",
158+
"\n",
159+
"# Evaluate the accuracy metric on the test dataset\n",
160+
"test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n",
161+
"print(f\"Test accuracy = {test_accuracy}\")\n",
162+
"\n",
163+
"# If you have Khiops Visualization installed you may open the report as follows\n",
164+
"# khc.export_report_file(\"report.khj\")\n",
165+
"# kh.visualize_report(\"report.khj\")"
166+
]
167+
},
89168
{
90169
"cell_type": "markdown",
91170
"metadata": {},

khiops/samples/samples_sklearn.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,75 @@
2424
# pylint: disable=import-outside-toplevel,redefined-outer-name,reimported
2525

2626

27+
def khiops_classifier_text():
    """Trains a `.KhiopsClassifier` on a monotable dataframe with textual data

    The sample reads the ``NegativeAirlineTweets`` table from the Khiops samples
    directory, keeps 30% of its rows aside for testing, fits a classifier that
    predicts the ``negativereason`` column, and prints the feature summary, the
    first test predictions and probabilities, and the test accuracy.
    """
    # Imports (kept local so the sample can be copied as a standalone snippet)
    import os
    import pandas as pd
    from khiops import core as kh
    from khiops.sklearn import KhiopsClassifier
    from sklearn import metrics
    from sklearn.model_selection import train_test_split

    # Read the tab-separated tweets table located in the Khiops samples directory
    tweets_path = os.path.join(
        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
    )
    tweets_df = pd.read_csv(tweets_path, sep="\t")

    # Keep 30% of the rows for testing; the fixed seed makes the split repeatable
    train_df, test_df = train_test_split(tweets_df, test_size=0.3, random_state=1)

    # The "negativereason" column is the y target vector
    y_train, y_test = train_df["negativereason"], test_df["negativereason"]

    # Every other column goes to the X feature table, with the "text" column
    # converted to the pandas StringDtype
    # NOTE(review): presumably this dtype is what makes Khiops handle the column
    # as text rather than as a categorical value - confirm in the khiops docs
    X_train = train_df.drop(columns="negativereason").astype({"text": "string"})
    X_test = test_df.drop(columns="negativereason").astype({"text": "string"})

    # Create the classifier and fit it on the training data
    khc = KhiopsClassifier()
    khc.fit(X_train, y_train)

    # Report how many features were evaluated and kept, then the first three
    # used features; index 2 of each importance row is printed as "Importance"
    print(f"Features evaluated: {khc.n_features_evaluated_}")
    print(f"Features selected : {khc.n_features_used_}")
    print("Top 3 used features")
    top_features = zip(
        khc.feature_used_names_[:3], khc.feature_used_importances_[:3]
    )
    for name, importance_row in top_features:
        print(f"{name} - Importance: {importance_row[2]}")
    print("---")

    # Class predictions for the held-out rows
    y_test_pred = khc.predict(X_test)
    print("Predicted classes (first 10):")
    print(y_test_pred[:10])
    print("---")

    # Class probabilities for the held-out rows; the columns follow khc.classes_
    y_test_probas = khc.predict_proba(X_test)
    print(f"Class order: {khc.classes_}")
    print("Predicted class probabilities (first 10):")
    print(y_test_probas[:10])
    print("---")

    # Accuracy of the predictions on the held-out rows
    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
    print(f"Test accuracy = {test_accuracy}")

    # If you have Khiops Visualization installed you may open the report as follows
    # khc.export_report_file("report.khj")
    # kh.visualize_report("report.khj")
94+
95+
2796
def khiops_classifier():
2897
"""Trains a `.KhiopsClassifier` on a monotable dataframe"""
2998
# Imports
@@ -830,6 +899,7 @@ def khiops_coclustering_simplify():
830899

831900
exported_samples = [
832901
khiops_classifier,
902+
khiops_classifier_text,
833903
khiops_classifier_multiclass,
834904
khiops_classifier_multitable_star,
835905
khiops_classifier_multitable_snowflake,

0 commit comments

Comments
 (0)