@@ -164,7 +164,6 @@ Samples
164164 accidents_df = pd.read_csv(
165165 os.path.join(accidents_data_dir, " Accidents.txt" ),
166166 sep = " \t " ,
167- encoding = " latin1" ,
168167 )
169168 vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, " Vehicles.txt" ), sep = " \t " )
170169
@@ -218,33 +217,19 @@ Samples
218217
219218 # Load the dataset tables into dataframes
220219 accidents_data_dir = os.path.join(kh.get_samples_dir(), " Accidents" )
221- accidents_df = pd.read_csv(
222- os.path.join(accidents_data_dir, " Accidents.txt" ),
223- sep = " \t " ,
224- encoding = " latin1" ,
225- )
226- users_df = pd.read_csv(
227- os.path.join(accidents_data_dir, " Users.txt" ), sep = " \t " , encoding = " latin1"
228- )
229- vehicles_df = pd.read_csv(
230- os.path.join(accidents_data_dir, " Vehicles.txt" ),
231- sep = " \t " ,
232- encoding = " latin1" ,
233- )
234- places_df = pd.read_csv(
235- os.path.join(accidents_data_dir, " Places.txt" ), sep = " \t " , encoding = " latin1"
236- )
220+ accidents_df = pd.read_csv(os.path.join(accidents_data_dir, " Accidents.txt" ), sep = " \t " )
221+ users_df = pd.read_csv(os.path.join(accidents_data_dir, " Users.txt" ), sep = " \t " )
222+ vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, " Vehicles.txt" ), sep = " \t " )
223+ places_df = pd.read_csv(os.path.join(accidents_data_dir, " Places.txt" ), sep = " \t " )
237224
238- # Create the dataset spec
239- # Note: We discard the "Gravity" column from the "Users" table to avoid a target
240- # leak. This is because the column was used to build the target.
225+ # Build the multi-table dataset spec (drop the target column "Gravity")
241226 X = {
242227 " main_table" : " Accidents" ,
243228 " tables" : {
244- " Accidents" : (accidents_df, " AccidentId" ),
229+ " Accidents" : (accidents_df.drop( " Gravity " , axis = 1 ) , " AccidentId" ),
245230 " Vehicles" : (vehicles_df, [" AccidentId" , " VehicleId" ]),
246- " Users" : (users_df.drop( " Gravity " , axis = 1 ) , [" AccidentId" , " VehicleId" ]),
247- " Places" : (places_df, [ " AccidentId" ] ),
231+ " Users" : (users_df, [" AccidentId" , " VehicleId" ]),
232+ " Places" : (places_df, " AccidentId" ),
248233 },
249234 " relations" : [
250235 (" Accidents" , " Vehicles" ),
@@ -253,15 +238,8 @@ Samples
253238 ],
254239 }
255240
256- # Load the target variable "Gravity" from the "AccidentsSummary" dataset
257- y = pd.read_csv(
258- os.path.join(kh.get_samples_dir(), " AccidentsSummary" , " Accidents.txt" ),
259- usecols = [" Gravity" ],
260- sep = " \t " ,
261- encoding = " latin1" ,
262- ).squeeze(
263- " columns"
264- ) # squeeze to ensure pandas.Series
241+ # Load the target variable "Gravity"
242+ y = accidents_df[" Gravity" ]
265243
266244 # Split into train and test datasets
267245 X_train, X_test, y_train, y_test = train_test_split_dataset(X, y)
@@ -406,7 +384,6 @@ Samples
406384 accidents_df = pd.read_csv(
407385 os.path.join(accidents_dataset_path, " Accidents.txt" ),
408386 sep = " \t " ,
409- encoding = " latin1" ,
410387 )
411388
412389 # Split the root dataframe into train and test
@@ -575,18 +552,19 @@ Samples
575552 accidents_df = pd.read_csv(
576553 os.path.join(accidents_data_dir, " Accidents.txt" ),
577554 sep = " \t " ,
578- encoding = " latin1" ,
579555 )
580556 vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, " Vehicles.txt" ), sep = " \t " )
581557
582- # Build the multi-table spec and the target
558+ # Build the multi-table dataset spec (drop the target column "Gravity")
583559 X = {
584560 " main_table" : " Accidents" ,
585561 " tables" : {
586562 " Accidents" : (accidents_df.drop(" Gravity" , axis = 1 ), " AccidentId" ),
587563 " Vehicles" : (vehicles_df, [" AccidentId" , " VehicleId" ]),
588564 },
589565 }
566+
567+ # Load the target variable "Gravity"
590568 y = accidents_df[" Gravity" ]
591569
592570 # Create the KhiopsEncoder with 5 multitable features and fit it
@@ -609,50 +587,29 @@ Samples
609587
610588 # Load the tables into dataframes
611589 accidents_data_dir = os.path.join(kh.get_samples_dir(), " Accidents" )
612- accidents_df = pd.read_csv(
613- os.path.join(accidents_data_dir, " Accidents.txt" ),
614- sep = " \t " ,
615- encoding = " latin1" ,
616- )
617- places_df = pd.read_csv(
618- os.path.join(accidents_data_dir, " Places.txt" ), sep = " \t " , encoding = " latin1"
619- )
620- users_df = pd.read_csv(
621- os.path.join(accidents_data_dir, " Users.txt" ), sep = " \t " , encoding = " latin1"
622- )
623- vehicles_df = pd.read_csv(
624- os.path.join(accidents_data_dir, " Vehicles.txt" ),
625- sep = " \t " ,
626- encoding = " latin1" ,
627- )
590+ accidents_df = pd.read_csv(os.path.join(accidents_data_dir, " Accidents.txt" ), sep = " \t " )
591+ users_df = pd.read_csv(os.path.join(accidents_data_dir, " Users.txt" ), sep = " \t " )
592+ vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, " Vehicles.txt" ), sep = " \t " )
593+ places_df = pd.read_csv(os.path.join(accidents_data_dir, " Places.txt" ), sep = " \t " )
628594
629- # Build the multi-table spec
630- # Note: We discard the "Gravity" field from the "Users" table as it was used to
631- # build the target column
595+ # Build the multi-table dataset spec (drop the target column "Gravity")
632596 X = {
633597 " main_table" : " Accidents" ,
634598 " tables" : {
635- " Accidents" : (accidents_df, " AccidentId" ),
636- " Places" : (places_df, " AccidentId" ),
599+ " Accidents" : (accidents_df.drop(" Gravity" , axis = 1 ), " AccidentId" ),
637600 " Vehicles" : (vehicles_df, [" AccidentId" , " VehicleId" ]),
638- " Users" : (users_df.drop(" Gravity" , axis = 1 ), [" AccidentId" , " VehicleId" ]),
601+ " Users" : (users_df, [" AccidentId" , " VehicleId" ]),
602+ " Places" : (places_df, " AccidentId" ),
639603 },
640604 " relations" : [
641605 (" Accidents" , " Vehicles" ),
642- (" Accidents" , " Places" , True ),
643606 (" Vehicles" , " Users" ),
607+ (" Accidents" , " Places" , True ),
644608 ],
645609 }
646610
647- # Load the target variable from the AccidentsSummary dataset
648- y = pd.read_csv(
649- os.path.join(kh.get_samples_dir(), " AccidentsSummary" , " Accidents.txt" ),
650- usecols = [" Gravity" ],
651- sep = " \t " ,
652- encoding = " latin1" ,
653- ).squeeze(
654- " columns"
655- ) # squeeze to ensure pandas.Series
611+ # Load the target variable "Gravity"
612+ y = accidents_df[" Gravity" ]
656613
657614 # Create the KhiopsEncoder with 10 additional multitable features and fit it
658615 khe = KhiopsEncoder(n_features = 10 )
@@ -735,33 +692,26 @@ Samples
735692 from khiops import core as kh
736693 from khiops.sklearn import KhiopsEncoder
737694
738- # Load the root table of the dataset into a pandas dataframe
739- accidents_dataset_path = os.path.join(kh.get_samples_dir(), " AccidentsSummary" )
740- accidents_df = pd.read_csv(
741- os.path.join(accidents_dataset_path, " Accidents.txt" ),
742- sep = " \t " ,
743- encoding = " latin1" ,
744- )
745-
746- # Obtain the root X feature table and the y target vector ("Class" column)
747- X_main = accidents_df.drop(" Gravity" , axis = 1 )
748- y = accidents_df[" Gravity" ]
749-
750- # Load the secondary table of the dataset into a pandas dataframe
751- X_secondary = pd.read_csv(
752- os.path.join(accidents_dataset_path, " Vehicles.txt" ), sep = " \t "
753- )
695+ # Load the tables into dataframes
696+ accidents_data_dir = os.path.join(kh.get_samples_dir(), " AccidentsSummary" )
697+ accidents_df = pd.read_csv(os.path.join(accidents_data_dir, " Accidents.txt" ), sep = " \t " )
698+ vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, " Vehicles.txt" ), sep = " \t " )
754699
755- # Create the dataset multitable specification for the train/test split
756- # We specify each table with a name and a tuple (dataframe, key_columns)
757- X_dataset = {
700+ # Build the multi-table dataset spec (drop the target column "Gravity")
701+ X = {
758702 " main_table" : " Accidents" ,
759703 " tables" : {
760- " Accidents" : (X_main , " AccidentId" ),
761- " Vehicles" : (X_secondary , [" AccidentId" , " VehicleId" ]),
704+ " Accidents" : (accidents_df.drop( " Gravity " , axis = 1 ) , " AccidentId" ),
705+ " Vehicles" : (vehicles_df , [" AccidentId" , " VehicleId" ]),
762706 },
707+ " relations" : [
708+ (" Accidents" , " Vehicles" ),
709+ ],
763710 }
764711
712+ # Load the target variable "Gravity"
713+ y = accidents_df[" Gravity" ]
714+
765715 # Create the KhiopsEncoder with 20 additional multitable features and fit it
766716 khe = KhiopsEncoder(
767717 n_features = 20 ,
@@ -777,13 +727,13 @@ Samples
777727 transform_type_numerical = " part_id" ,
778728 transform_pairs = " part_id" ,
779729 )
780- khe.fit(X_dataset , y)
730+ khe.fit(X , y)
781731
782732 # Transform the train dataset
783733 print (" Encoded feature names:" )
784734 print (khe.feature_names_out_)
785735 print (" Encoded data:" )
786- print (khe.transform(X_dataset )[:10 ])
736+ print (khe.transform(X )[:10 ])
787737 .. autofunction :: khiops_coclustering
788738.. code-block :: python
789739
@@ -867,7 +817,6 @@ Samples
867817 accidents_df = pd.read_csv(
868818 os.path.join(accidents_data_dir, " Accidents.txt" ),
869819 sep = " \t " ,
870- encoding = " latin1" ,
871820 )
872821 X = accidents_df.drop(" Gravity" , axis = 1 )
873822 y = accidents_df[" Gravity" ]
@@ -932,7 +881,6 @@ Samples
932881 accidents_df = pd.read_csv(
933882 os.path.join(accidents_dataset_path, " Accidents.txt" ),
934883 sep = " \t " ,
935- encoding = " latin1" ,
936884 )
937885
938886 # Split the root dataframe into train and test
0 commit comments