Skip to content

Commit 3c47c76

Browse files
Update sample code for khiops-samples dataset revision 10.2.4
1 parent 4a2f300 commit 3c47c76

File tree

7 files changed

+145
-301
lines changed

7 files changed

+145
-301
lines changed

.github/workflows/unit-tests.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
---
22
name: Unit Tests
33
env:
4-
DEFAULT_SAMPLES_REVISION: 10.2.0
4+
DEFAULT_SAMPLES_REVISION: 10.2.4
55
DEFAULT_KHIOPS_DESKTOP_REVISION: 10.2.3
66
on:
77
workflow_dispatch:
88
inputs:
99
samples-revision:
10-
default: 10.2.0
11-
description: Git tag, branch or commit for the khiops-samples repository
10+
default: 10.2.4
11+
description: Git Tag/Branch/Commit for the khiops-samples Repo
1212
image-tag:
1313
default: latest
1414
description: Development Docker Image Tag

doc/multi_table_primer.rst

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,8 @@ schema:
135135
136136
We build the input ``X`` as follows::
137137

138-
accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Accidents.txt", sep="\t", encoding="latin1")
139-
vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Vehicles.txt", sep="\t", encoding="latin1")
138+
accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Accidents.txt", sep="\t")
139+
vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Vehicles.txt", sep="\t")
140140
X = {
141141
"main_table" : "Accident",
142142
"tables": {
@@ -164,19 +164,18 @@ through the following *snowflake* schema
164164
165165
We build the input ``X`` as follows::
166166

167-
# We use `Accidents.txt` table of `AccidentsSummary` as it contains the `Gravity` label pre-calculated
168-
accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Accidents.txt", sep="\t", encoding="latin1")
169-
vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Vehicles.txt", sep="\t", encoding="latin1")
170-
users_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Users.txt", sep="\t", encoding="latin1")
171-
places_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Places.txt", sep="\t", encoding="latin1")
167+
accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Accidents.txt", sep="\t")
168+
vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Vehicles.txt", sep="\t")
169+
users_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Users.txt", sep="\t")
170+
places_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Places.txt", sep="\t")
172171

173172
X = {
174173
"main_table": "Accidents",
175174
"tables": {
176175
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
177176
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
178177
"Users": (users_df, ["AccidentId", "VehicleId"]),
179-
"Places": (places_df, ["AccidentId"]),
178+
"Places": (places_df, "AccidentId"),
180179

181180
},
182181
"relations": [

doc/samples/samples_sklearn.rst

Lines changed: 40 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,6 @@ Samples
164164
accidents_df = pd.read_csv(
165165
os.path.join(accidents_data_dir, "Accidents.txt"),
166166
sep="\t",
167-
encoding="latin1",
168167
)
169168
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")
170169
@@ -218,33 +217,19 @@ Samples
218217
219218
# Load the dataset tables into dataframes
220219
accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents")
221-
accidents_df = pd.read_csv(
222-
os.path.join(accidents_data_dir, "Accidents.txt"),
223-
sep="\t",
224-
encoding="latin1",
225-
)
226-
users_df = pd.read_csv(
227-
os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1"
228-
)
229-
vehicles_df = pd.read_csv(
230-
os.path.join(accidents_data_dir, "Vehicles.txt"),
231-
sep="\t",
232-
encoding="latin1",
233-
)
234-
places_df = pd.read_csv(
235-
os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1"
236-
)
220+
accidents_df = pd.read_csv(os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t")
221+
users_df = pd.read_csv(os.path.join(accidents_data_dir, "Users.txt"), sep="\t")
222+
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")
223+
places_df = pd.read_csv(os.path.join(accidents_data_dir, "Places.txt"), sep="\t")
237224
238-
# Create the dataset spec
239-
# Note: We discard the "Gravity" column from the "Users" table to avoid a target
240-
# leak. This is because the column was used to build the target.
225+
# Build the multi-table dataset spec (drop the target column "Gravity")
241226
X = {
242227
"main_table": "Accidents",
243228
"tables": {
244-
"Accidents": (accidents_df, "AccidentId"),
229+
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
245230
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
246-
"Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]),
247-
"Places": (places_df, ["AccidentId"]),
231+
"Users": (users_df, ["AccidentId", "VehicleId"]),
232+
"Places": (places_df, "AccidentId"),
248233
},
249234
"relations": [
250235
("Accidents", "Vehicles"),
@@ -253,15 +238,8 @@ Samples
253238
],
254239
}
255240
256-
# Load the target variable "Gravity" from the "AccidentsSummary" dataset
257-
y = pd.read_csv(
258-
os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"),
259-
usecols=["Gravity"],
260-
sep="\t",
261-
encoding="latin1",
262-
).squeeze(
263-
"columns"
264-
) # squeeze to ensure pandas.Series
241+
# Load the target variable "Gravity"
242+
y = accidents_df["Gravity"]
265243
266244
# Split into train and test datasets
267245
X_train, X_test, y_train, y_test = train_test_split_dataset(X, y)
@@ -406,7 +384,6 @@ Samples
406384
accidents_df = pd.read_csv(
407385
os.path.join(accidents_dataset_path, "Accidents.txt"),
408386
sep="\t",
409-
encoding="latin1",
410387
)
411388
412389
# Split the root dataframe into train and test
@@ -575,18 +552,19 @@ Samples
575552
accidents_df = pd.read_csv(
576553
os.path.join(accidents_data_dir, "Accidents.txt"),
577554
sep="\t",
578-
encoding="latin1",
579555
)
580556
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")
581557
582-
# Build the multi-table spec and the target
558+
# Build the multi-table dataset spec (drop the target column "Gravity")
583559
X = {
584560
"main_table": "Accidents",
585561
"tables": {
586562
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
587563
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
588564
},
589565
}
566+
567+
# Load the target variable "Gravity"
590568
y = accidents_df["Gravity"]
591569
592570
# Create the KhiopsEncoder with 5 multitable features and fit it
@@ -609,50 +587,29 @@ Samples
609587
610588
# Load the tables into dataframes
611589
accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents")
612-
accidents_df = pd.read_csv(
613-
os.path.join(accidents_data_dir, "Accidents.txt"),
614-
sep="\t",
615-
encoding="latin1",
616-
)
617-
places_df = pd.read_csv(
618-
os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1"
619-
)
620-
users_df = pd.read_csv(
621-
os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1"
622-
)
623-
vehicles_df = pd.read_csv(
624-
os.path.join(accidents_data_dir, "Vehicles.txt"),
625-
sep="\t",
626-
encoding="latin1",
627-
)
590+
accidents_df = pd.read_csv(os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t")
591+
users_df = pd.read_csv(os.path.join(accidents_data_dir, "Users.txt"), sep="\t")
592+
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")
593+
places_df = pd.read_csv(os.path.join(accidents_data_dir, "Places.txt"), sep="\t")
628594
629-
# Build the multi-table spec
630-
# Note: We discard the "Gravity" field from the "Users" table as it was used to
631-
# build the target column
595+
# Build the multi-table dataset spec (drop the target column "Gravity")
632596
X = {
633597
"main_table": "Accidents",
634598
"tables": {
635-
"Accidents": (accidents_df, "AccidentId"),
636-
"Places": (places_df, "AccidentId"),
599+
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
637600
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
638-
"Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]),
601+
"Users": (users_df, ["AccidentId", "VehicleId"]),
602+
"Places": (places_df, "AccidentId"),
639603
},
640604
"relations": [
641605
("Accidents", "Vehicles"),
642-
("Accidents", "Places", True),
643606
("Vehicles", "Users"),
607+
("Accidents", "Places", True),
644608
],
645609
}
646610
647-
# Load the target variable from the AccidentsSummary dataset
648-
y = pd.read_csv(
649-
os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"),
650-
usecols=["Gravity"],
651-
sep="\t",
652-
encoding="latin1",
653-
).squeeze(
654-
"columns"
655-
) # squeeze to ensure pandas.Series
611+
# Load the target variable "Gravity"
612+
y = accidents_df["Gravity"]
656613
657614
# Create the KhiopsEncoder with 10 additional multitable features and fit it
658615
khe = KhiopsEncoder(n_features=10)
@@ -735,33 +692,26 @@ Samples
735692
from khiops import core as kh
736693
from khiops.sklearn import KhiopsEncoder
737694
738-
# Load the root table of the dataset into a pandas dataframe
739-
accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
740-
accidents_df = pd.read_csv(
741-
os.path.join(accidents_dataset_path, "Accidents.txt"),
742-
sep="\t",
743-
encoding="latin1",
744-
)
745-
746-
# Obtain the root X feature table and the y target vector ("Class" column)
747-
X_main = accidents_df.drop("Gravity", axis=1)
748-
y = accidents_df["Gravity"]
749-
750-
# Load the secondary table of the dataset into a pandas dataframe
751-
X_secondary = pd.read_csv(
752-
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
753-
)
695+
# Load the tables into dataframes
696+
accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
697+
accidents_df = pd.read_csv(os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t")
698+
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")
754699
755-
# Create the dataset multitable specification for the train/test split
756-
# We specify each table with a name and a tuple (dataframe, key_columns)
757-
X_dataset = {
700+
# Build the multi-table dataset spec (drop the target column "Gravity")
701+
X = {
758702
"main_table": "Accidents",
759703
"tables": {
760-
"Accidents": (X_main, "AccidentId"),
761-
"Vehicles": (X_secondary, ["AccidentId", "VehicleId"]),
704+
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
705+
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
762706
},
707+
"relations": [
708+
("Accidents", "Vehicles"),
709+
],
763710
}
764711
712+
# Load the target variable "Gravity"
713+
y = accidents_df["Gravity"]
714+
765715
# Create the KhiopsEncoder with 10 additional multitable features and fit it
766716
khe = KhiopsEncoder(
767717
n_features=20,
@@ -777,13 +727,13 @@ Samples
777727
transform_type_numerical="part_id",
778728
transform_pairs="part_id",
779729
)
780-
khe.fit(X_dataset, y)
730+
khe.fit(X, y)
781731
782732
# Transform the train dataset
783733
print("Encoded feature names:")
784734
print(khe.feature_names_out_)
785735
print("Encoded data:")
786-
print(khe.transform(X_dataset)[:10])
736+
print(khe.transform(X)[:10])
787737
.. autofunction:: khiops_coclustering
788738
.. code-block:: python
789739
@@ -867,7 +817,6 @@ Samples
867817
accidents_df = pd.read_csv(
868818
os.path.join(accidents_data_dir, "Accidents.txt"),
869819
sep="\t",
870-
encoding="latin1",
871820
)
872821
X = accidents_df.drop("Gravity", axis=1)
873822
y = accidents_df["Gravity"]
@@ -932,7 +881,6 @@ Samples
932881
accidents_df = pd.read_csv(
933882
os.path.join(accidents_dataset_path, "Accidents.txt"),
934883
sep="\t",
935-
encoding="latin1",
936884
)
937885
938886
# Split the root dataframe into train and test

0 commit comments

Comments (0)