|
86 | 86 | "# kh.visualize_report(\"report.khj\")" |
87 | 87 | ] |
88 | 88 | }, |
| 89 | + { |
| 90 | + "cell_type": "markdown", |
| 91 | + "metadata": {}, |
| 92 | + "source": [ |
| 93 | + "### `khiops_classifier_text()`\n\n", |
| 94 | + "Train a `KhiopsClassifier` on a monotable dataframe with textual data\n" |
| 95 | + ] |
| 96 | + }, |
| 97 | + { |
| 98 | + "cell_type": "code", |
| 99 | + "execution_count": null, |
| 100 | + "metadata": {}, |
| 101 | + "outputs": [], |
| 102 | + "source": [ |
| 103 | + "# Imports\n", |
| 104 | + "import os\n", |
| 105 | + "import pandas as pd\n", |
| 106 | + "from khiops import core as kh\n", |
| 107 | + "from khiops.sklearn import KhiopsClassifier\n", |
| 108 | + "from sklearn import metrics\n", |
| 109 | + "from sklearn.model_selection import train_test_split\n", |
| 110 | + "\n", |
| 111 | + "# Load the dataset into a pandas dataframe\n", |
| 112 | + "data_table_path = os.path.join(\n", |
| 113 | + " kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n", |
| 114 | + ")\n", |
| 115 | + "data_df = pd.read_csv(data_table_path, sep=\"\\t\")\n", |
| 116 | + "\n", |
| 117 | + "# Split the whole dataframe into train and test (70%-30%)\n", |
| 118 | + "data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)\n", |
| 119 | + "\n", |
| 120 | + "# Split the dataset into:\n", |
| 121 | + "# - the X feature table\n", |
| 122 | + "# - the y target vector (\"negativereason\" column)\n", |
| 123 | + "X_train = data_train_df.drop(\"negativereason\", axis=1)\n", |
| 124 | + "X_test = data_test_df.drop(\"negativereason\", axis=1)\n", |
| 125 | + "y_train = data_train_df[\"negativereason\"]\n", |
| 126 | + "y_test = data_test_df[\"negativereason\"]\n", |
| 127 | + "\n", |
| 128 | + "# Set Pandas StringDtype on the \"text\" column\n", |
| 129 | + "X_train[\"text\"] = X_train[\"text\"].astype(\"string\")\n", |
| 130 | + "X_test[\"text\"] = X_test[\"text\"].astype(\"string\")\n", |
| 131 | + "\n", |
| 132 | + "# Create the classifier object\n", |
| 133 | + "khc = KhiopsClassifier()\n", |
| 134 | + "\n", |
| 135 | + "# Train the classifier\n", |
| 136 | + "khc.fit(X_train, y_train)\n", |
| 137 | + "\n", |
| 138 | + "# Show the feature importance info\n", |
| 139 | + "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n", |
| 140 | + "print(f\"Features selected : {khc.n_features_used_}\")\n", |
| 141 | + "print(\"Top 3 used features\")\n", |
| 142 | + "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", |
| 143 | + " print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n", |
| 144 | + "print(\"---\")\n", |
| 145 | + "\n", |
| 146 | + "# Predict the classes on the test dataset\n", |
| 147 | + "y_test_pred = khc.predict(X_test)\n", |
| 148 | + "print(\"Predicted classes (first 10):\")\n", |
| 149 | + "print(y_test_pred[0:10])\n", |
| 150 | + "print(\"---\")\n", |
| 151 | + "\n", |
| 152 | + "# Predict the class probabilities on the test dataset\n", |
| 153 | + "y_test_probas = khc.predict_proba(X_test)\n", |
| 154 | + "print(f\"Class order: {khc.classes_}\")\n", |
| 155 | + "print(\"Predicted class probabilities (first 10):\")\n", |
| 156 | + "print(y_test_probas[0:10])\n", |
| 157 | + "print(\"---\")\n", |
| 158 | + "\n", |
| 159 | + "# Evaluate the accuracy metric on the test dataset\n", |
| 160 | + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", |
| 161 | + "print(f\"Test accuracy = {test_accuracy}\")\n", |
| 162 | + "\n", |
| 163 | + "# If you have Khiops Visualization installed you may open the report as follows\n", |
| 164 | + "# khc.export_report_file(\"report.khj\")\n", |
| 165 | + "# kh.visualize_report(\"report.khj\")" |
| 166 | + ] |
| 167 | + }, |
89 | 168 | { |
90 | 169 | "cell_type": "markdown", |
91 | 170 | "metadata": {}, |
|
0 commit comments