|
149 | 149 | "print(f\"Test auc = {test_auc}\")" |
150 | 150 | ] |
151 | 151 | }, |
| 152 | + { |
| 153 | + "cell_type": "markdown", |
| 154 | + "metadata": {}, |
| 155 | + "source": [ |
| 156 | + "### `khiops_classifier_text()`\n\n", |
| 157 | + "Train a `KhiopsClassifier` on a monotable dataframe with textual data\n" |
| 158 | + ] |
| 159 | + }, |
| 160 | + { |
| 161 | + "cell_type": "code", |
| 162 | + "execution_count": null, |
| 163 | + "metadata": {}, |
| 164 | + "outputs": [], |
| 165 | + "source": [ |
| 166 | + "# Imports\n", |
| 167 | + "import os\n", |
| 168 | + "import pandas as pd\n", |
| 169 | + "from khiops import core as kh\n", |
| 170 | + "from khiops.sklearn import KhiopsClassifier\n", |
| 171 | + "from sklearn import metrics\n", |
| 172 | + "from sklearn.model_selection import train_test_split\n", |
| 173 | + "\n", |
| 174 | + "# Load the dataset into a pandas dataframe\n", |
| 175 | + "data_table_path = os.path.join(\n", |
| 176 | + " kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n", |
| 177 | + ")\n", |
| 178 | + "data_df = pd.read_csv(data_table_path, sep=\"\\t\")\n", |
| 179 | + "\n", |
| 180 | + "# Split the whole dataframe into train and test (70%-30%)\n", |
| 181 | + "data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)\n", |
| 182 | + "\n", |
| 183 | + "# Split the dataset into:\n", |
| 184 | + "# - the X feature table\n", |
| 185 | + "# - the y target vector (\"negativereason\" column)\n", |
| 186 | + "X_train = data_train_df.drop(\"negativereason\", axis=1)\n", |
| 187 | + "X_test = data_test_df.drop(\"negativereason\", axis=1)\n", |
| 188 | + "y_train = data_train_df[\"negativereason\"]\n", |
| 189 | + "y_test = data_test_df[\"negativereason\"]\n", |
| 190 | + "\n", |
| 191 | + "# Convert the \"text\" column to the pandas \"string\" dtype (StringDtype)\n", |
| 192 | + "X_train[\"text\"] = X_train[\"text\"].astype(\"string\")\n", |
| 193 | + "X_test[\"text\"] = X_test[\"text\"].astype(\"string\")\n", |
| 194 | + "\n", |
| 195 | + "# Create the classifier object\n", |
| 196 | + "khc = KhiopsClassifier()\n", |
| 197 | + "\n", |
| 198 | + "# Train the classifier\n", |
| 199 | + "khc.fit(X_train, y_train)\n", |
| 200 | + "\n", |
| 201 | + "# Show the feature importance info\n", |
| 202 | + "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n", |
| 203 | + "print(f\"Features selected : {khc.n_features_used_}\")\n", |
| 204 | + "print(\"Top 3 used features\")\n", |
| 205 | + "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", |
| 206 | + " print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n", |
| 207 | + "print(\"---\")\n", |
| 208 | + "\n", |
| 209 | + "# Predict the classes on the test dataset\n", |
| 210 | + "y_test_pred = khc.predict(X_test)\n", |
| 211 | + "print(\"Predicted classes (first 10):\")\n", |
| 212 | + "print(y_test_pred[0:10])\n", |
| 213 | + "print(\"---\")\n", |
| 214 | + "\n", |
| 215 | + "# Predict the class probabilities on the test dataset\n", |
| 216 | + "y_test_probas = khc.predict_proba(X_test)\n", |
| 217 | + "print(f\"Class order: {khc.classes_}\")\n", |
| 218 | + "print(\"Predicted class probabilities (first 10):\")\n", |
| 219 | + "print(y_test_probas[0:10])\n", |
| 220 | + "print(\"---\")\n", |
| 221 | + "\n", |
| 222 | + "# Evaluate the accuracy metric on the test dataset\n", |
| 223 | + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", |
| 224 | + "print(f\"Test accuracy = {test_accuracy}\")\n", |
| 225 | + "\n", |
| 226 | + "# If you have Khiops Visualization installed you may open the report as follows\n", |
| 227 | + "# khc.export_report_file(\"report.khj\")\n", |
| 228 | + "# kh.visualize_report(\"report.khj\")" |
| 229 | + ] |
| 230 | + }, |
152 | 231 | { |
153 | 232 | "cell_type": "markdown", |
154 | 233 | "metadata": {}, |
|
0 commit comments