Skip to content

Commit c43c94a

Browse files
committed
Update sklearn code to v11
- update Core API calls to the v11 Core API
- drop all sklearn-specific deprecated Khiops estimator fields
- drop deprecated `key` dataset attribute
- take into account dataset deprecation drops
- drop deprecated sklearn samples
1 parent 3b0fdb2 commit c43c94a

File tree

8 files changed

+51
-878
lines changed

8 files changed

+51
-878
lines changed

doc/samples/samples_sklearn.rst

Lines changed: 0 additions & 153 deletions
Original file line numberDiff line numberDiff line change
@@ -803,156 +803,3 @@ Samples
803803
print("Predicted clusters (only three at most)")
804804
print(X_clusters)
805805
print("---")
806-
.. autofunction:: khiops_classifier_multitable_list
807-
.. code-block:: python
808-
809-
# Imports
810-
import os
811-
import pandas as pd
812-
from khiops import core as kh
813-
from khiops.sklearn import KhiopsClassifier
814-
from sklearn import metrics
815-
from sklearn.model_selection import train_test_split
816-
817-
# Load the root table of the dataset into a pandas dataframe
818-
accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
819-
accidents_df = pd.read_csv(
820-
os.path.join(accidents_data_dir, "Accidents.txt"),
821-
sep="\t",
822-
)
823-
X = accidents_df.drop("Gravity", axis=1)
824-
y = accidents_df["Gravity"]
825-
826-
# Split the dataset into train and test
827-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
828-
829-
# Load the secondary table of the dataset into a pandas dataframe
830-
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")
831-
832-
# Split the secondary dataframe with the keys of the split root dataframe
833-
X_train_ids = X_train["AccidentId"].to_frame()
834-
X_test_ids = X_test["AccidentId"].to_frame()
835-
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
836-
X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId")
837-
838-
# Create the classifier specifying the key column name
839-
khc = KhiopsClassifier(key="AccidentId")
840-
841-
# Train the classifier
842-
khc.fit([X_train, X_train_secondary], y_train)
843-
844-
# Predict the class on the test dataset
845-
y_test_pred = khc.predict([X_test, X_test_secondary])
846-
print("Predicted classes (first 10):")
847-
print(y_test_pred[:10])
848-
print("---")
849-
850-
# Predict the class probability on the test dataset
851-
y_test_probas = khc.predict_proba([X_test, X_test_secondary])
852-
print("Predicted class probabilities (first 10):")
853-
print(y_test_probas[:10])
854-
print("---")
855-
856-
# Evaluate accuracy and auc metrics on the test dataset
857-
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
858-
test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
859-
print(f"Test accuracy = {test_accuracy}")
860-
print(f"Test auc = {test_auc}")
861-
.. autofunction:: khiops_classifier_multitable_star_file
862-
.. code-block:: python
863-
864-
# Imports
865-
import os
866-
import pandas as pd
867-
from khiops import core as kh
868-
from khiops.sklearn import KhiopsClassifier
869-
from sklearn import metrics
870-
from sklearn.model_selection import train_test_split
871-
872-
# Create output directory
873-
results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_star_file")
874-
if not os.path.exists("kh_samples"):
875-
os.mkdir("kh_samples")
876-
os.mkdir(results_dir)
877-
else:
878-
if not os.path.exists(results_dir):
879-
os.mkdir(results_dir)
880-
881-
# Load the root table of the dataset into a pandas dataframe
882-
accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
883-
accidents_df = pd.read_csv(
884-
os.path.join(accidents_dataset_path, "Accidents.txt"),
885-
sep="\t",
886-
)
887-
888-
# Split the root dataframe into train and test
889-
X_train_main, X_test_main = train_test_split(
890-
accidents_df, test_size=0.3, random_state=1
891-
)
892-
893-
# Load the secondary table of the dataset into a pandas dataframe
894-
vehicles_df = pd.read_csv(
895-
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
896-
)
897-
898-
# Split the secondary dataframe with the keys of the split root dataframe
899-
X_train_ids = X_train_main["AccidentId"].to_frame()
900-
X_test_ids = X_test_main["AccidentId"].to_frame()
901-
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
902-
X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId")
903-
904-
# Write the train and test dataset sets to disk
905-
# For the test file we remove the target column from the main table
906-
X_train_main_path = os.path.join(results_dir, "X_train_main.txt")
907-
X_train_main.to_csv(X_train_main_path, sep="\t", header=True, index=False)
908-
X_train_secondary_path = os.path.join(results_dir, "X_train_secondary.txt")
909-
X_train_secondary.to_csv(X_train_secondary_path, sep="\t", header=True, index=False)
910-
X_test_main_path = os.path.join(results_dir, "X_test_main.txt")
911-
y_test = X_test_main.sort_values("AccidentId")["Gravity"]
912-
X_test_main.drop(columns="Gravity").to_csv(
913-
X_test_main_path, sep="\t", header=True, index=False
914-
)
915-
X_test_secondary_path = os.path.join(results_dir, "X_test_secondary.txt")
916-
X_test_secondary.to_csv(X_test_secondary_path, sep="\t", header=True, index=False)
917-
918-
# Define the dictionary of train
919-
X_train = {
920-
"main_table": "Accidents",
921-
"tables": {
922-
"Accidents": (X_train_main_path, "AccidentId"),
923-
"Vehicles": (X_train_secondary_path, ["AccidentId", "VehicleId"]),
924-
},
925-
"format": ("\t", True),
926-
}
927-
X_test = {
928-
"main_table": "Accidents",
929-
"tables": {
930-
"Accidents": (X_test_main_path, "AccidentId"),
931-
"Vehicles": (X_test_secondary_path, ["AccidentId", "VehicleId"]),
932-
},
933-
"format": ("\t", True),
934-
}
935-
936-
# Create the classifier and fit it
937-
khc = KhiopsClassifier(output_dir=results_dir)
938-
khc.fit(X_train, y="Gravity")
939-
940-
# Predict the class in addition to the class probabilities on the test dataset
941-
y_test_pred_path = khc.predict(X_test)
942-
y_test_pred = pd.read_csv(y_test_pred_path, sep="\t")
943-
print("Predicted classes (first 10):")
944-
print(y_test_pred["PredictedGravity"].head(10))
945-
print("---")
946-
947-
y_test_probas_path = khc.predict_proba(X_test)
948-
y_test_probas = pd.read_csv(y_test_probas_path, sep="\t")
949-
proba_columns = [col for col in y_test_probas if col.startswith("Prob")]
950-
print("Predicted class probabilities (first 10):")
951-
print(y_test_probas[proba_columns].head(10))
952-
print("---")
953-
954-
# Evaluate accuracy and auc metrics on the test dataset
955-
test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"])
956-
test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityLethal"])
957-
print(f"Test accuracy = {test_accuracy}")
958-
print(f"Test auc = {test_auc}")

khiops/samples/samples_sklearn.ipynb

Lines changed: 0 additions & 179 deletions
Original file line numberDiff line numberDiff line change
@@ -972,185 +972,6 @@
972972
"print(X_clusters)\n",
973973
"print(\"---\")"
974974
]
975-
},
976-
{
977-
"cell_type": "markdown",
978-
"metadata": {},
979-
"source": [
980-
"### `khiops_classifier_multitable_list()`\n\n",
981-
"Trains a KhiopsClassifier using a list dataset specification\n\n .. warning::\n This dataset input method is **Deprecated** and will be removed in Khiops 11.\n \n"
982-
]
983-
},
984-
{
985-
"cell_type": "code",
986-
"execution_count": null,
987-
"metadata": {},
988-
"outputs": [],
989-
"source": [
990-
"# Imports\n",
991-
"import os\n",
992-
"import pandas as pd\n",
993-
"from khiops import core as kh\n",
994-
"from khiops.sklearn import KhiopsClassifier\n",
995-
"from sklearn import metrics\n",
996-
"from sklearn.model_selection import train_test_split\n",
997-
"\n",
998-
"# Load the root table of the dataset into a pandas dataframe\n",
999-
"accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n",
1000-
"accidents_df = pd.read_csv(\n",
1001-
" os.path.join(accidents_data_dir, \"Accidents.txt\"),\n",
1002-
" sep=\"\\t\",\n",
1003-
")\n",
1004-
"X = accidents_df.drop(\"Gravity\", axis=1)\n",
1005-
"y = accidents_df[\"Gravity\"]\n",
1006-
"\n",
1007-
"# Split the dataset into train and test\n",
1008-
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)\n",
1009-
"\n",
1010-
"# Load the secondary table of the dataset into a pandas dataframe\n",
1011-
"vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n",
1012-
"\n",
1013-
"# Split the secondary dataframe with the keys of the split root dataframe\n",
1014-
"X_train_ids = X_train[\"AccidentId\"].to_frame()\n",
1015-
"X_test_ids = X_test[\"AccidentId\"].to_frame()\n",
1016-
"X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n",
1017-
"X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n",
1018-
"\n",
1019-
"# Create the classifier specifying the key column name\n",
1020-
"khc = KhiopsClassifier(key=\"AccidentId\")\n",
1021-
"\n",
1022-
"# Train the classifier\n",
1023-
"khc.fit([X_train, X_train_secondary], y_train)\n",
1024-
"\n",
1025-
"# Predict the class on the test dataset\n",
1026-
"y_test_pred = khc.predict([X_test, X_test_secondary])\n",
1027-
"print(\"Predicted classes (first 10):\")\n",
1028-
"print(y_test_pred[:10])\n",
1029-
"print(\"---\")\n",
1030-
"\n",
1031-
"# Predict the class probability on the test dataset\n",
1032-
"y_test_probas = khc.predict_proba([X_test, X_test_secondary])\n",
1033-
"print(\"Predicted class probabilities (first 10):\")\n",
1034-
"print(y_test_probas[:10])\n",
1035-
"print(\"---\")\n",
1036-
"\n",
1037-
"# Evaluate accuracy and auc metrics on the test dataset\n",
1038-
"test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n",
1039-
"test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n",
1040-
"print(f\"Test accuracy = {test_accuracy}\")\n",
1041-
"print(f\"Test auc = {test_auc}\")"
1042-
]
1043-
},
1044-
{
1045-
"cell_type": "markdown",
1046-
"metadata": {},
1047-
"source": [
1048-
"### `khiops_classifier_multitable_star_file()`\n\n",
1049-
"Trains a `.KhiopsClassifier` with a file path based dataset\n\n .. warning::\n This dataset input method is **Deprecated** and will be removed in Khiops 11.\n If you need to handle large datasets that do not easily fit into memory then you\n may use the `~.khiops.core` API directly, which allows to specify file paths\n directly.\n \n"
1050-
]
1051-
},
1052-
{
1053-
"cell_type": "code",
1054-
"execution_count": null,
1055-
"metadata": {},
1056-
"outputs": [],
1057-
"source": [
1058-
"# Imports\n",
1059-
"import os\n",
1060-
"import pandas as pd\n",
1061-
"from khiops import core as kh\n",
1062-
"from khiops.sklearn import KhiopsClassifier\n",
1063-
"from sklearn import metrics\n",
1064-
"from sklearn.model_selection import train_test_split\n",
1065-
"\n",
1066-
"# Create output directory\n",
1067-
"results_dir = os.path.join(\"kh_samples\", \"khiops_classifier_multitable_star_file\")\n",
1068-
"if not os.path.exists(\"kh_samples\"):\n",
1069-
" os.mkdir(\"kh_samples\")\n",
1070-
" os.mkdir(results_dir)\n",
1071-
"else:\n",
1072-
" if not os.path.exists(results_dir):\n",
1073-
" os.mkdir(results_dir)\n",
1074-
"\n",
1075-
"# Load the root table of the dataset into a pandas dataframe\n",
1076-
"accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n",
1077-
"accidents_df = pd.read_csv(\n",
1078-
" os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n",
1079-
" sep=\"\\t\",\n",
1080-
")\n",
1081-
"\n",
1082-
"# Split the root dataframe into train and test\n",
1083-
"X_train_main, X_test_main = train_test_split(\n",
1084-
" accidents_df, test_size=0.3, random_state=1\n",
1085-
")\n",
1086-
"\n",
1087-
"# Load the secondary table of the dataset into a pandas dataframe\n",
1088-
"vehicles_df = pd.read_csv(\n",
1089-
" os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n",
1090-
")\n",
1091-
"\n",
1092-
"# Split the secondary dataframe with the keys of the split root dataframe\n",
1093-
"X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n",
1094-
"X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n",
1095-
"X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n",
1096-
"X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n",
1097-
"\n",
1098-
"# Write the train and test dataset sets to disk\n",
1099-
"# For the test file we remove the target column from the main table\n",
1100-
"X_train_main_path = os.path.join(results_dir, \"X_train_main.txt\")\n",
1101-
"X_train_main.to_csv(X_train_main_path, sep=\"\\t\", header=True, index=False)\n",
1102-
"X_train_secondary_path = os.path.join(results_dir, \"X_train_secondary.txt\")\n",
1103-
"X_train_secondary.to_csv(X_train_secondary_path, sep=\"\\t\", header=True, index=False)\n",
1104-
"X_test_main_path = os.path.join(results_dir, \"X_test_main.txt\")\n",
1105-
"y_test = X_test_main.sort_values(\"AccidentId\")[\"Gravity\"]\n",
1106-
"X_test_main.drop(columns=\"Gravity\").to_csv(\n",
1107-
" X_test_main_path, sep=\"\\t\", header=True, index=False\n",
1108-
")\n",
1109-
"X_test_secondary_path = os.path.join(results_dir, \"X_test_secondary.txt\")\n",
1110-
"X_test_secondary.to_csv(X_test_secondary_path, sep=\"\\t\", header=True, index=False)\n",
1111-
"\n",
1112-
"# Define the dictionary of train\n",
1113-
"X_train = {\n",
1114-
" \"main_table\": \"Accidents\",\n",
1115-
" \"tables\": {\n",
1116-
" \"Accidents\": (X_train_main_path, \"AccidentId\"),\n",
1117-
" \"Vehicles\": (X_train_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n",
1118-
" },\n",
1119-
" \"format\": (\"\\t\", True),\n",
1120-
"}\n",
1121-
"X_test = {\n",
1122-
" \"main_table\": \"Accidents\",\n",
1123-
" \"tables\": {\n",
1124-
" \"Accidents\": (X_test_main_path, \"AccidentId\"),\n",
1125-
" \"Vehicles\": (X_test_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n",
1126-
" },\n",
1127-
" \"format\": (\"\\t\", True),\n",
1128-
"}\n",
1129-
"\n",
1130-
"# Create the classifier and fit it\n",
1131-
"khc = KhiopsClassifier(output_dir=results_dir)\n",
1132-
"khc.fit(X_train, y=\"Gravity\")\n",
1133-
"\n",
1134-
"# Predict the class in addition to the class probabilities on the test dataset\n",
1135-
"y_test_pred_path = khc.predict(X_test)\n",
1136-
"y_test_pred = pd.read_csv(y_test_pred_path, sep=\"\\t\")\n",
1137-
"print(\"Predicted classes (first 10):\")\n",
1138-
"print(y_test_pred[\"PredictedGravity\"].head(10))\n",
1139-
"print(\"---\")\n",
1140-
"\n",
1141-
"y_test_probas_path = khc.predict_proba(X_test)\n",
1142-
"y_test_probas = pd.read_csv(y_test_probas_path, sep=\"\\t\")\n",
1143-
"proba_columns = [col for col in y_test_probas if col.startswith(\"Prob\")]\n",
1144-
"print(\"Predicted class probabilities (first 10):\")\n",
1145-
"print(y_test_probas[proba_columns].head(10))\n",
1146-
"print(\"---\")\n",
1147-
"\n",
1148-
"# Evaluate accuracy and auc metrics on the test dataset\n",
1149-
"test_accuracy = metrics.accuracy_score(y_test, y_test_pred[\"PredictedGravity\"])\n",
1150-
"test_auc = metrics.roc_auc_score(y_test, y_test_probas[\"ProbGravityLethal\"])\n",
1151-
"print(f\"Test accuracy = {test_accuracy}\")\n",
1152-
"print(f\"Test auc = {test_auc}\")"
1153-
]
1154975
}
1155976
],
1156977
"metadata": {},

0 commit comments

Comments
 (0)