From bb4c63914d75bb8127c9d0617d8fb3632b58e438 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:13:16 +0100 Subject: [PATCH 1/2] Rename transform_pairs parameter to transform_type_pairs --- doc/samples/samples_sklearn.rst | 2 +- khiops/samples/samples_sklearn.ipynb | 2 +- khiops/samples/samples_sklearn.py | 2 +- khiops/sklearn/estimators.py | 22 ++++++++++++---------- tests/test_sklearn.py | 10 +++++----- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index 22fc4cfd..b280181c 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -723,7 +723,7 @@ Samples keep_initial_variables=True, transform_type_categorical="part_id", transform_type_numerical="part_id", - transform_pairs="part_id", + transform_type_pairs="part_id", ) khe.fit(X, y) diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index e2aec0f1..11057461 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -865,7 +865,7 @@ " keep_initial_variables=True,\n", " transform_type_categorical=\"part_id\",\n", " transform_type_numerical=\"part_id\",\n", - " transform_pairs=\"part_id\",\n", + " transform_type_pairs=\"part_id\",\n", ")\n", "khe.fit(X, y)\n", "\n", diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index da591810..ad431d9f 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -761,7 +761,7 @@ def khiops_encoder_with_hyperparameters(): keep_initial_variables=True, transform_type_categorical="part_id", transform_type_numerical="part_id", - transform_pairs="part_id", + transform_type_pairs="part_id", ) khe.fit(X, y) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 3d5c122c..9e011ce0 100644 --- a/khiops/sklearn/estimators.py +++ 
b/khiops/sklearn/estimators.py @@ -2733,7 +2733,7 @@ class KhiopsEncoder(TransformerMixin, KhiopsSupervisedEstimator): See the documentation for the ``numerical_recoding_method`` parameter of the `~.api.train_recoder` function for more details. - transform_pairs: str, default "part_id" + transform_type_pairs : str, default "part_id" Type of transformation for bivariate features. Valid values: - "part_id" - "part_label" @@ -2811,7 +2811,7 @@ def __init__( keep_initial_variables=False, transform_type_categorical="part_id", transform_type_numerical="part_id", - transform_pairs="part_id", + transform_type_pairs="part_id", verbose=False, output_dir=None, auto_sort=True, @@ -2835,7 +2835,7 @@ def __init__( self.group_target_value = group_target_value self.transform_type_categorical = transform_type_categorical self.transform_type_numerical = transform_type_numerical - self.transform_pairs = transform_pairs + self.transform_type_pairs = transform_type_pairs self.informative_features_only = informative_features_only self.keep_initial_variables = keep_initial_variables self._khiops_model_prefix = "R_" @@ -2892,12 +2892,12 @@ def _pairs_transform_method(self): "conditional_info": "conditional info", None: "none", } - if self.transform_pairs not in _transform_types: + if self.transform_type_pairs not in _transform_types: raise ValueError( - "'transform_pairs' must be one of the following:" + "'transform_type_pairs' must be one of the following:" ",".join(_transform_types.keys) ) - return _transform_types[self.transform_pairs] + return _transform_types[self.transform_type_pairs] def _fit_check_params(self, ds, **kwargs): # Call parent method @@ -2931,10 +2931,12 @@ def _fit_check_params(self, ds, **kwargs): "transform_type_categorical and transform_type_numerical " "cannot be both None with n_trees == 0." 
) - # Check 'transform_pairs' parameter - if not isinstance(self.transform_pairs, str): + # Check 'transform_type_pairs' parameter + if not isinstance(self.transform_type_pairs, str): raise TypeError( - type_error_message("transform_pairs", self.transform_pairs, str) + type_error_message( + "transform_type_pairs", self.transform_type_pairs, str + ) ) self._pairs_transform_method() # Raises ValueError if invalid @@ -3036,7 +3038,7 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir): del kwargs["transform_type_categorical"] del kwargs["transform_type_numerical"] - del kwargs["transform_pairs"] + del kwargs["transform_type_pairs"] del kwargs["categorical_target"] return args, kwargs diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 6e6d6bd3..48a5784f 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -2267,7 +2267,7 @@ def test_parameter_transfer_encoder_fit_from_monotable_dataframe(self): "keep_initial_variables": False, "transform_type_categorical": "part_id", "transform_type_numerical": "part_id", - "transform_pairs": "part_id", + "transform_type_pairs": "part_id", }, ) @@ -2291,7 +2291,7 @@ def test_parameter_transfer_encoder_fit_from_monotable_dataframe_with_df_y( "keep_initial_variables": False, "transform_type_categorical": "part_id", "transform_type_numerical": "part_id", - "transform_pairs": "part_id", + "transform_type_pairs": "part_id", }, ) @@ -2313,7 +2313,7 @@ def test_parameter_transfer_encoder_fit_from_monotable_file_dataset(self): "keep_initial_variables": False, "transform_type_categorical": "part_id", "transform_type_numerical": "part_id", - "transform_pairs": "part_id", + "transform_type_pairs": "part_id", }, ) @@ -2336,7 +2336,7 @@ def test_parameter_transfer_encoder_fit_from_multitable_dataframe(self): "keep_initial_variables": False, "transform_type_categorical": "part_id", "transform_type_numerical": "part_id", - "transform_pairs": "part_id", + "transform_type_pairs": "part_id", }, ) @@ -2359,7 
+2359,7 @@ def test_parameter_transfer_encoder_fit_from_multitable_file_dataset(self): "keep_initial_variables": False, "transform_type_categorical": "part_id", "transform_type_numerical": "part_id", - "transform_pairs": "part_id", + "transform_type_pairs": "part_id", }, ) From 3bf819ff756d68d03274d3b53f5b749478e56414 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:01:40 +0100 Subject: [PATCH 2/2] Control the types on sklearn internal read table We do this only for KhiopsClassifier and KhiopsRegressor: It is critical for KhiopsClassifier as it accepts many target types and it is trivial in the case of KhiopsRegressor. For KhiopsEncoder and KhiopsCoclustering it is less critical and for the first one it is very complex. We left them as TODO's. Additionally, we now also check in the "output type" tests that the result of predict is correct. Before we only checked that the classes_ attribute was ok. This is to further ensure correctness.
--- khiops/sklearn/dataset.py | 32 ++++- khiops/sklearn/estimators.py | 210 +++++++++++++++-------------- tests/test_sklearn_output_types.py | 93 ++++++++++--- 3 files changed, 213 insertions(+), 122 deletions(-) diff --git a/khiops/sklearn/dataset.py b/khiops/sklearn/dataset.py index 26da745f..75b2fb5f 100644 --- a/khiops/sklearn/dataset.py +++ b/khiops/sklearn/dataset.py @@ -346,7 +346,7 @@ def get_khiops_variable_name(column_id): return variable_name -def read_internal_data_table(file_path_or_stream): +def read_internal_data_table(file_path_or_stream, column_dtypes=None): """Reads into a DataFrame a data table file with the internal format settings The table is read with the following settings: @@ -357,18 +357,34 @@ def read_internal_data_table(file_path_or_stream): - Use `csv.QUOTE_MINIMAL` - double quoting enabled (quotes within quotes can be escaped with '""') - UTF-8 encoding + - User-specified dtypes (optional) Parameters ---------- file_path_or_stream : str or file object The path of the internal data table file to be read or a readable file object. + column_dtypes : dict, optional + Dictionary linking column names with dtypes. See ``dtype`` parameter of the + `pandas.read_csv` function. If not set, then the column types are detected + automatically by pandas. Returns ------- `pandas.DataFrame` - The dataframe representation. + The dataframe representation of the data table. 
""" + # Change the 'U' types (Unicode strings) to 'O' because pandas does not support them + # in read_csv + if column_dtypes is not None: + execution_column_dtypes = {} + for column_name, dtype in column_dtypes.items(): + if hasattr(dtype, "kind") and dtype.kind == "U": + execution_column_dtypes[column_name] = np.dtype("O") + else: + execution_column_dtypes = None + + # Read and return the dataframe return pd.read_csv( file_path_or_stream, sep="\t", @@ -377,6 +393,7 @@ def read_internal_data_table(file_path_or_stream): quoting=csv.QUOTE_MINIMAL, doublequote=True, encoding="utf-8", + dtype=execution_column_dtypes, ) @@ -1132,6 +1149,11 @@ def __repr__(self): f"dtypes={dtypes_str}>" ) + def get_column_dtype(self, column_id): + if column_id not in self.data_source.dtypes: + raise KeyError(f"Column '{column_id}' not found in the dtypes field") + return self.data_source.dtypes[column_id] + def create_table_file_for_khiops( self, output_dir, sort=True, target_column=None, target_column_id=None ): @@ -1214,6 +1236,9 @@ def __repr__(self): f"dtype={dtype_str}; target={self.target_column_id}>" ) + def get_column_dtype(self, _): + return self.data_source.dtype + def create_table_file_for_khiops( self, output_dir, sort=True, target_column=None, target_column_id=None ): @@ -1300,6 +1325,9 @@ def __repr__(self): f"dtype={dtype_str}>" ) + def get_column_dtype(self, _): + return self.data_source.dtype + def create_khiops_dictionary(self): """Creates a Khiops dictionary representing this sparse table diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 9e011ce0..c69baff4 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -232,7 +232,7 @@ class KhiopsEstimator(ABC, BaseEstimator): The name of the column to be used as key. **Deprecated** will be removed in Khiops 11. 
internal_sort : bool, optional - *Advanced.*: See concrete estimator classes for information about this + *Advanced*: See concrete estimator classes for information about this parameter. **Deprecated** will be removed in Khiops 11. Use the ``auto_sort`` estimator parameter instead. @@ -470,7 +470,7 @@ def _transform( self, ds, computation_dir, - _transform_create_deployment_model_fun, + _transform_prepare_deployment_fun, drop_key, transformed_file_name, ): @@ -482,11 +482,13 @@ def _transform( self._transform_check_dataset(ds) # Create a deployment dataset - # Note: The input dataset is not necessarily ready to be deployed + # Note: The input dataset isn't ready for deployment in the case of coclustering deployment_ds = self._transform_create_deployment_dataset(ds, computation_dir) - # Create a deployment dictionary - deployment_dictionary_domain = _transform_create_deployment_model_fun(ds) + # Create a deployment dictionary and the internal table column dtypes + deployment_dictionary_domain, internal_table_column_dtypes = ( + _transform_prepare_deployment_fun(ds) + ) # Deploy the model output_table_path = self._transform_deploy_model( @@ -497,10 +499,36 @@ def _transform( transformed_file_name, ) - # Post-process to return the correct output type - return self._transform_deployment_post_process( - deployment_ds, output_table_path, drop_key - ) + # Post-process to return the correct output type and order + if deployment_ds.is_in_memory: + # Load the table as a dataframe + with io.BytesIO(fs.read(output_table_path)) as output_table_stream: + output_table_df = read_internal_data_table( + output_table_stream, column_dtypes=internal_table_column_dtypes + ) + + # On multi-table: + # - Reorder the table to the original table order + # - Because transformed data table file is sorted by key + # - Drop the key columns if specified + if deployment_ds.is_multitable: + key_df = deployment_ds.main_table.data_source[ + deployment_ds.main_table.key + ] + output_table_df_or_path = 
key_df.merge( + output_table_df, on=deployment_ds.main_table.key + ) + if drop_key: + output_table_df_or_path.drop( + deployment_ds.main_table.key, axis=1, inplace=True + ) + # On mono-table: Return the read dataframe as-is + else: + output_table_df_or_path = output_table_df + else: + output_table_df_or_path = output_table_path + + return output_table_df_or_path def _transform_create_deployment_dataset(self, ds, _): """Creates if necessary a new dataset to execute the model deployment @@ -609,44 +637,6 @@ def _transform_check_dataset(self, ds): if ds.table_type == FileTable and self.output_dir is None: raise ValueError("'output_dir' is not set but dataset is file-based") - def _transform_deployment_post_process( - self, deployment_ds, output_table_path, drop_key - ): - # Return a dataframe for dataframe based datasets - if deployment_ds.is_in_memory: - # Read the transformed table with the internal table settings - with io.BytesIO(fs.read(output_table_path)) as output_table_stream: - output_table_df = read_internal_data_table(output_table_stream) - - # On multi-table: - # - Reorder the table to the original table order - # - Because transformed data table file is sorted by key - # - Drop the key columns if specified - if deployment_ds.is_multitable: - key_df = deployment_ds.main_table.data_source[ - deployment_ds.main_table.key - ] - output_table_df_or_path = key_df.merge( - output_table_df, on=deployment_ds.main_table.key - ) - if drop_key: - output_table_df_or_path.drop( - deployment_ds.main_table.key, axis=1, inplace=True - ) - # On mono-table: Return the read dataframe as-is - else: - output_table_df_or_path = output_table_df - # Return a file path for file based datasets - else: - output_table_df_or_path = output_table_path - - assert isinstance( - output_table_df_or_path, (str, pd.DataFrame) - ), type_error_message( - "output_table_df_or_path", output_table_df_or_path, str, pd.DataFrame - ) - return output_table_df_or_path - def _create_computation_dir(self, 
method_name): """Creates a temporary computation directory""" return kh.get_runner().create_temp_dir( @@ -1266,7 +1256,7 @@ def predict(self, X): y_pred = super()._transform( ds, computation_dir, - self._transform_prepare_deployment_model_for_predict, + self._transform_prepare_deployment_for_predict, False, "predict.txt", ) @@ -1372,16 +1362,11 @@ def _transform_create_deployment_dataset(self, ds, computation_dir): return Dataset(deploy_dataset_spec) - def _transform_prepare_deployment_model_for_predict(self, _): - return self.model_.copy() - - def _transform_deployment_post_process( - self, deployment_ds, output_table_path, drop_key - ): - assert deployment_ds.is_multitable - return super()._transform_deployment_post_process( - deployment_ds, output_table_path, drop_key - ) + def _transform_prepare_deployment_for_predict(self, _): + # TODO: Replace the second return value (the output columns' dtypes) with a + # proper value instead of `None`. In the current state, it will use pandas + # type auto-detection to load the internal table into memory. 
+ return self.model_.copy(), None def fit_predict(self, X, y=None, **kwargs): """Performs clustering on X and returns result (instead of labels)""" @@ -1418,6 +1403,7 @@ def __init__( self.specific_pairs = specific_pairs self.all_possible_pairs = all_possible_pairs self.construction_rules = construction_rules + self._original_target_dtype = None self._predicted_target_meta_data_tag = None # Deprecation message for 'key' constructor parameter @@ -1625,6 +1611,22 @@ def _fit_training_post_process(self, ds): # Call parent method super()._fit_training_post_process(ds) + # Save the target and key column dtype's + if ds.is_in_memory: + if self._original_target_dtype is None: + self._original_target_dtype = ds.target_column.dtype + if ds.main_table.key is not None: + self._original_key_dtypes = {} + for column_id in ds.main_table.key: + self._original_key_dtypes[column_id] = ( + ds.main_table.get_column_dtype(column_id) + ) + else: + self._original_key_dtypes = None + else: + self._original_target_dtype = None + self._original_key_dtypes = None + # Set the target variable name self.model_target_variable_name_ = get_khiops_variable_name(ds.target_column_id) @@ -1800,6 +1802,7 @@ def __init__( ) # Data to be specified by inherited classes self._predicted_target_meta_data_tag = None + self._predicted_target_name_prefix = None self.n_evaluated_features = n_evaluated_features self.n_selected_features = n_selected_features @@ -1827,7 +1830,7 @@ def predict(self, X): y_pred = super()._transform( ds, computation_dir, - self._transform_prepare_deployment_model_for_predict, + self._transform_prepare_deployment_for_predict, True, "predict.txt", ) @@ -1855,7 +1858,7 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir): return args, kwargs - def _transform_prepare_deployment_model_for_predict(self, ds): + def _transform_prepare_deployment_for_predict(self, ds): assert ( self._predicted_target_meta_data_tag is not None ), "Predicted target metadata tag is not set" @@ 
-1880,7 +1883,20 @@ def _transform_prepare_deployment_model_for_predict(self, ds): if self.model_target_variable_name_ not in list(ds.main_table.column_ids): model_dictionary.remove_variable(self.model_target_variable_name_) - return model_copy + # Create the output column dtype dict + if ds.is_in_memory: + predicted_target_column_name = ( + self._predicted_target_name_prefix + self.model_target_variable_name_ + ) + output_columns_dtype = { + predicted_target_column_name: self._original_target_dtype + } + if self.is_multitable_model_: + output_columns_dtype.update(self._original_key_dtypes) + else: + output_columns_dtype = None + + return model_copy, output_columns_dtype def get_feature_used_statistics(self, modeling_report): # Extract, from the modeling report, names, levels, weights and importances @@ -1895,7 +1911,7 @@ def get_feature_used_statistics(self, modeling_report): for var in modeling_report.selected_variables ] ) - # Return empty arrays if not selected_variables is available + # Return empty arrays if no selected variables are available else: feature_used_names_ = np.array([], dtype=np.dtype("