|
972 | 972 | "print(X_clusters)\n", |
973 | 973 | "print(\"---\")" |
974 | 974 | ] |
975 | | - }, |
976 | | - { |
977 | | - "cell_type": "markdown", |
978 | | - "metadata": {}, |
979 | | - "source": [ |
980 | | - "### `khiops_classifier_multitable_list()`\n\n", |
981 | | - "Trains a `KhiopsClassifier` using a list dataset specification.\n\n**Warning:** This dataset input method is **deprecated** and will be removed in Khiops 11.\n" |
982 | | - ] |
983 | | - }, |
984 | | - { |
985 | | - "cell_type": "code", |
986 | | - "execution_count": null, |
987 | | - "metadata": {}, |
988 | | - "outputs": [], |
989 | | - "source": [ |
990 | | - "# Imports\n", |
991 | | - "import os\n", |
992 | | - "import pandas as pd\n", |
993 | | - "from khiops import core as kh\n", |
994 | | - "from khiops.sklearn import KhiopsClassifier\n", |
995 | | - "from sklearn import metrics\n", |
996 | | - "from sklearn.model_selection import train_test_split\n", |
997 | | - "\n", |
998 | | - "# Load the root table of the dataset into a pandas dataframe\n", |
999 | | - "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", |
1000 | | - "accidents_df = pd.read_csv(\n", |
1001 | | - " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", |
1002 | | - " sep=\"\\t\",\n", |
1003 | | - ")\n", |
1004 | | - "X = accidents_df.drop(\"Gravity\", axis=1)\n", |
1005 | | - "y = accidents_df[\"Gravity\"]\n", |
1006 | | - "\n", |
1007 | | - "# Split the dataset into train and test\n", |
1008 | | - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)\n", |
1009 | | - "\n", |
1010 | | - "# Load the secondary table of the dataset into a pandas dataframe\n", |
1011 | | - "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", |
1012 | | - "\n", |
1013 | | - "# Split the secondary dataframe with the keys of the split root dataframe\n", |
1014 | | - "X_train_ids = X_train[\"AccidentId\"].to_frame()\n", |
1015 | | - "X_test_ids = X_test[\"AccidentId\"].to_frame()\n", |
1016 | | - "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", |
1017 | | - "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", |
1018 | | - "\n", |
1019 | | - "# Create the classifier specifying the key column name\n", |
1020 | | - "khc = KhiopsClassifier(key=\"AccidentId\")\n", |
1021 | | - "\n", |
1022 | | - "# Train the classifier\n", |
1023 | | - "khc.fit([X_train, X_train_secondary], y_train)\n", |
1024 | | - "\n", |
1025 | | - "# Predict the class on the test dataset\n", |
1026 | | - "y_test_pred = khc.predict([X_test, X_test_secondary])\n", |
1027 | | - "print(\"Predicted classes (first 10):\")\n", |
1028 | | - "print(y_test_pred[:10])\n", |
1029 | | - "print(\"---\")\n", |
1030 | | - "\n", |
1031 | | - "# Predict the class probability on the test dataset\n", |
1032 | | - "y_test_probas = khc.predict_proba([X_test, X_test_secondary])\n", |
1033 | | - "print(\"Predicted class probabilities (first 10):\")\n", |
1034 | | - "print(y_test_probas[:10])\n", |
1035 | | - "print(\"---\")\n", |
1036 | | - "\n", |
1037 | | - "# Evaluate accuracy and auc metrics on the test dataset\n", |
1038 | | - "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", |
1039 | | - "test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", |
1040 | | - "print(f\"Test accuracy = {test_accuracy}\")\n", |
1041 | | - "print(f\"Test auc = {test_auc}\")" |
1042 | | - ] |
1043 | | - }, |
1044 | | - { |
1045 | | - "cell_type": "markdown", |
1046 | | - "metadata": {}, |
1047 | | - "source": [ |
1048 | | - "### `khiops_classifier_multitable_star_file()`\n\n", |
1049 | | - "Trains a `KhiopsClassifier` with a file-path-based dataset.\n\n**Warning:** This dataset input method is **deprecated** and will be removed in Khiops 11. If you need to handle large datasets that do not easily fit into memory, you may use the `khiops.core` API directly, which allows file paths to be specified as inputs.\n" |
1050 | | - ] |
1051 | | - }, |
1052 | | - { |
1053 | | - "cell_type": "code", |
1054 | | - "execution_count": null, |
1055 | | - "metadata": {}, |
1056 | | - "outputs": [], |
1057 | | - "source": [ |
1058 | | - "# Imports\n", |
1059 | | - "import os\n", |
1060 | | - "import pandas as pd\n", |
1061 | | - "from khiops import core as kh\n", |
1062 | | - "from khiops.sklearn import KhiopsClassifier\n", |
1063 | | - "from sklearn import metrics\n", |
1064 | | - "from sklearn.model_selection import train_test_split\n", |
1065 | | - "\n", |
1066 | | - "# Create output directory\n", |
1067 | | - "results_dir = os.path.join(\"kh_samples\", \"khiops_classifier_multitable_star_file\")\n", |
1068 | | - "if not os.path.exists(\"kh_samples\"):\n", |
1069 | | - " os.mkdir(\"kh_samples\")\n", |
1070 | | - " os.mkdir(results_dir)\n", |
1071 | | - "else:\n", |
1072 | | - " if not os.path.exists(results_dir):\n", |
1073 | | - " os.mkdir(results_dir)\n", |
1074 | | - "\n", |
1075 | | - "# Load the root table of the dataset into a pandas dataframe\n", |
1076 | | - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", |
1077 | | - "accidents_df = pd.read_csv(\n", |
1078 | | - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", |
1079 | | - " sep=\"\\t\",\n", |
1080 | | - ")\n", |
1081 | | - "\n", |
1082 | | - "# Split the root dataframe into train and test\n", |
1083 | | - "X_train_main, X_test_main = train_test_split(\n", |
1084 | | - " accidents_df, test_size=0.3, random_state=1\n", |
1085 | | - ")\n", |
1086 | | - "\n", |
1087 | | - "# Load the secondary table of the dataset into a pandas dataframe\n", |
1088 | | - "vehicles_df = pd.read_csv(\n", |
1089 | | - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", |
1090 | | - ")\n", |
1091 | | - "\n", |
1092 | | - "# Split the secondary dataframe with the keys of the split root dataframe\n", |
1093 | | - "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", |
1094 | | - "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", |
1095 | | - "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", |
1096 | | - "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", |
1097 | | - "\n", |
1098 | | - "# Write the train and test datasets to disk\n", |
1099 | | - "# For the test file we remove the target column from the main table\n", |
1100 | | - "X_train_main_path = os.path.join(results_dir, \"X_train_main.txt\")\n", |
1101 | | - "X_train_main.to_csv(X_train_main_path, sep=\"\\t\", header=True, index=False)\n", |
1102 | | - "X_train_secondary_path = os.path.join(results_dir, \"X_train_secondary.txt\")\n", |
1103 | | - "X_train_secondary.to_csv(X_train_secondary_path, sep=\"\\t\", header=True, index=False)\n", |
1104 | | - "X_test_main_path = os.path.join(results_dir, \"X_test_main.txt\")\n", |
1105 | | - "y_test = X_test_main.sort_values(\"AccidentId\")[\"Gravity\"]\n", |
1106 | | - "X_test_main.drop(columns=\"Gravity\").to_csv(\n", |
1107 | | - " X_test_main_path, sep=\"\\t\", header=True, index=False\n", |
1108 | | - ")\n", |
1109 | | - "X_test_secondary_path = os.path.join(results_dir, \"X_test_secondary.txt\")\n", |
1110 | | - "X_test_secondary.to_csv(X_test_secondary_path, sep=\"\\t\", header=True, index=False)\n", |
1111 | | - "\n", |
1112 | | - "# Define the dataset specification dictionaries for train and test\n", |
1113 | | - "X_train = {\n", |
1114 | | - " \"main_table\": \"Accidents\",\n", |
1115 | | - " \"tables\": {\n", |
1116 | | - " \"Accidents\": (X_train_main_path, \"AccidentId\"),\n", |
1117 | | - " \"Vehicles\": (X_train_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n", |
1118 | | - " },\n", |
1119 | | - " \"format\": (\"\\t\", True),\n", |
1120 | | - "}\n", |
1121 | | - "X_test = {\n", |
1122 | | - " \"main_table\": \"Accidents\",\n", |
1123 | | - " \"tables\": {\n", |
1124 | | - " \"Accidents\": (X_test_main_path, \"AccidentId\"),\n", |
1125 | | - " \"Vehicles\": (X_test_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n", |
1126 | | - " },\n", |
1127 | | - " \"format\": (\"\\t\", True),\n", |
1128 | | - "}\n", |
1129 | | - "\n", |
1130 | | - "# Create the classifier and fit it\n", |
1131 | | - "khc = KhiopsClassifier(output_dir=results_dir)\n", |
1132 | | - "khc.fit(X_train, y=\"Gravity\")\n", |
1133 | | - "\n", |
1134 | | - "# Predict the class in addition to the class probabilities on the test dataset\n", |
1135 | | - "y_test_pred_path = khc.predict(X_test)\n", |
1136 | | - "y_test_pred = pd.read_csv(y_test_pred_path, sep=\"\\t\")\n", |
1137 | | - "print(\"Predicted classes (first 10):\")\n", |
1138 | | - "print(y_test_pred[\"PredictedGravity\"].head(10))\n", |
1139 | | - "print(\"---\")\n", |
1140 | | - "\n", |
1141 | | - "y_test_probas_path = khc.predict_proba(X_test)\n", |
1142 | | - "y_test_probas = pd.read_csv(y_test_probas_path, sep=\"\\t\")\n", |
1143 | | - "proba_columns = [col for col in y_test_probas if col.startswith(\"Prob\")]\n", |
1144 | | - "print(\"Predicted class probabilities (first 10):\")\n", |
1145 | | - "print(y_test_probas[proba_columns].head(10))\n", |
1146 | | - "print(\"---\")\n", |
1147 | | - "\n", |
1148 | | - "# Evaluate accuracy and auc metrics on the test dataset\n", |
1149 | | - "test_accuracy = metrics.accuracy_score(y_test, y_test_pred[\"PredictedGravity\"])\n", |
1150 | | - "test_auc = metrics.roc_auc_score(y_test, y_test_probas[\"ProbGravityLethal\"])\n", |
1151 | | - "print(f\"Test accuracy = {test_accuracy}\")\n", |
1152 | | - "print(f\"Test auc = {test_auc}\")" |
1153 | | - ] |
1154 | 975 | } |
1155 | 976 | ], |
1156 | 977 | "metadata": {}, |
|
0 commit comments