diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index 22fc4cfd..b280181c 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -723,7 +723,7 @@ Samples keep_initial_variables=True, transform_type_categorical="part_id", transform_type_numerical="part_id", - transform_pairs="part_id", + transform_type_pairs="part_id", ) khe.fit(X, y) diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index e2aec0f1..11057461 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -865,7 +865,7 @@ " keep_initial_variables=True,\n", " transform_type_categorical=\"part_id\",\n", " transform_type_numerical=\"part_id\",\n", - " transform_pairs=\"part_id\",\n", + " transform_type_pairs=\"part_id\",\n", ")\n", "khe.fit(X, y)\n", "\n", diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index da591810..ad431d9f 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -761,7 +761,7 @@ def khiops_encoder_with_hyperparameters(): keep_initial_variables=True, transform_type_categorical="part_id", transform_type_numerical="part_id", - transform_pairs="part_id", + transform_type_pairs="part_id", ) khe.fit(X, y) diff --git a/khiops/sklearn/dataset.py b/khiops/sklearn/dataset.py index 26da745f..75b2fb5f 100644 --- a/khiops/sklearn/dataset.py +++ b/khiops/sklearn/dataset.py @@ -346,7 +346,7 @@ def get_khiops_variable_name(column_id): return variable_name -def read_internal_data_table(file_path_or_stream): +def read_internal_data_table(file_path_or_stream, column_dtypes=None): """Reads into a DataFrame a data table file with the internal format settings The table is read with the following settings: @@ -357,18 +357,34 @@ def read_internal_data_table(file_path_or_stream): - Use `csv.QUOTE_MINIMAL` - double quoting enabled (quotes within quotes can be escaped with '""') - UTF-8 encoding + - 
User-specified dtypes (optional)
 
     Parameters
     ----------
     file_path_or_stream : str or file object
         The path of the internal data table file to be read or a readable file
         object.
+    column_dtypes : dict, optional
+        Dictionary linking column names with dtypes. See ``dtype`` parameter of the
+        `pandas.read_csv` function. If not set, then the column types are detected
+        automatically by pandas.
 
     Returns
     -------
     `pandas.DataFrame`
-        The dataframe representation.
+        The dataframe representation of the data table.
     """
+    # Change the 'U' types (Unicode strings) to 'O' because pandas does not support them
+    # in read_csv
+    if column_dtypes is not None:
+        execution_column_dtypes = dict(column_dtypes)
+        for column_name, dtype in column_dtypes.items():
+            if hasattr(dtype, "kind") and dtype.kind == "U":
+                execution_column_dtypes[column_name] = np.dtype("O")
+    else:
+        execution_column_dtypes = None
+
+    # Read and return the dataframe
     return pd.read_csv(
         file_path_or_stream,
         sep="\t",
@@ -377,6 +393,7 @@
         quoting=csv.QUOTE_MINIMAL,
         doublequote=True,
         encoding="utf-8",
+        dtype=execution_column_dtypes,
     )
 
 
@@ -1132,6 +1149,11 @@ def __repr__(self):
             f"dtypes={dtypes_str}>"
         )
 
+    def get_column_dtype(self, column_id):
+        if column_id not in self.data_source.dtypes:
+            raise KeyError(f"Column '{column_id}' not found in the dtypes field")
+        return self.data_source.dtypes[column_id]
+
     def create_table_file_for_khiops(
         self, output_dir, sort=True, target_column=None, target_column_id=None
     ):
@@ -1214,6 +1236,9 @@ def __repr__(self):
             f"dtype={dtype_str}; target={self.target_column_id}>"
         )
 
+    def get_column_dtype(self, _):
+        return self.data_source.dtype
+
     def create_table_file_for_khiops(
         self, output_dir, sort=True, target_column=None, target_column_id=None
     ):
@@ -1300,6 +1325,9 @@ def __repr__(self):
             f"dtype={dtype_str}>"
         )
 
+    def get_column_dtype(self, _):
+        return self.data_source.dtype
+
     def create_khiops_dictionary(self):
         """Creates a Khiops dictionary representing 
this sparse table diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 3d5c122c..c69baff4 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -232,7 +232,7 @@ class KhiopsEstimator(ABC, BaseEstimator): The name of the column to be used as key. **Deprecated** will be removed in Khiops 11. internal_sort : bool, optional - *Advanced.*: See concrete estimator classes for information about this + *Advanced*: See concrete estimator classes for information about this parameter. **Deprecated** will be removed in Khiops 11. Use the ``auto_sort`` estimator parameter instead. @@ -470,7 +470,7 @@ def _transform( self, ds, computation_dir, - _transform_create_deployment_model_fun, + _transform_prepare_deployment_fun, drop_key, transformed_file_name, ): @@ -482,11 +482,13 @@ def _transform( self._transform_check_dataset(ds) # Create a deployment dataset - # Note: The input dataset is not necessarily ready to be deployed + # Note: The input dataset isn't ready for deployment in the case of coclustering deployment_ds = self._transform_create_deployment_dataset(ds, computation_dir) - # Create a deployment dictionary - deployment_dictionary_domain = _transform_create_deployment_model_fun(ds) + # Create a deployment dictionary and the internal table column dtypes + deployment_dictionary_domain, internal_table_column_dtypes = ( + _transform_prepare_deployment_fun(ds) + ) # Deploy the model output_table_path = self._transform_deploy_model( @@ -497,10 +499,36 @@ def _transform( transformed_file_name, ) - # Post-process to return the correct output type - return self._transform_deployment_post_process( - deployment_ds, output_table_path, drop_key - ) + # Post-process to return the correct output type and order + if deployment_ds.is_in_memory: + # Load the table as a dataframe + with io.BytesIO(fs.read(output_table_path)) as output_table_stream: + output_table_df = read_internal_data_table( + output_table_stream, 
column_dtypes=internal_table_column_dtypes + ) + + # On multi-table: + # - Reorder the table to the original table order + # - Because transformed data table file is sorted by key + # - Drop the key columns if specified + if deployment_ds.is_multitable: + key_df = deployment_ds.main_table.data_source[ + deployment_ds.main_table.key + ] + output_table_df_or_path = key_df.merge( + output_table_df, on=deployment_ds.main_table.key + ) + if drop_key: + output_table_df_or_path.drop( + deployment_ds.main_table.key, axis=1, inplace=True + ) + # On mono-table: Return the read dataframe as-is + else: + output_table_df_or_path = output_table_df + else: + output_table_df_or_path = output_table_path + + return output_table_df_or_path def _transform_create_deployment_dataset(self, ds, _): """Creates if necessary a new dataset to execute the model deployment @@ -609,44 +637,6 @@ def _transform_check_dataset(self, ds): if ds.table_type == FileTable and self.output_dir is None: raise ValueError("'output_dir' is not set but dataset is file-based") - def _transform_deployment_post_process( - self, deployment_ds, output_table_path, drop_key - ): - # Return a dataframe for dataframe based datasets - if deployment_ds.is_in_memory: - # Read the transformed table with the internal table settings - with io.BytesIO(fs.read(output_table_path)) as output_table_stream: - output_table_df = read_internal_data_table(output_table_stream) - - # On multi-table: - # - Reorder the table to the original table order - # - Because transformed data table file is sorted by key - # - Drop the key columns if specified - if deployment_ds.is_multitable: - key_df = deployment_ds.main_table.data_source[ - deployment_ds.main_table.key - ] - output_table_df_or_path = key_df.merge( - output_table_df, on=deployment_ds.main_table.key - ) - if drop_key: - output_table_df_or_path.drop( - deployment_ds.main_table.key, axis=1, inplace=True - ) - # On mono-table: Return the read dataframe as-is - else: - 
output_table_df_or_path = output_table_df - # Return a file path for file based datasets - else: - output_table_df_or_path = output_table_path - - assert isinstance( - output_table_df_or_path, (str, pd.DataFrame) - ), type_error_message( - "output_table_df_or_path", output_table_df_or_path, str, pd.DataFrame - ) - return output_table_df_or_path - def _create_computation_dir(self, method_name): """Creates a temporary computation directory""" return kh.get_runner().create_temp_dir( @@ -1266,7 +1256,7 @@ def predict(self, X): y_pred = super()._transform( ds, computation_dir, - self._transform_prepare_deployment_model_for_predict, + self._transform_prepare_deployment_for_predict, False, "predict.txt", ) @@ -1372,16 +1362,11 @@ def _transform_create_deployment_dataset(self, ds, computation_dir): return Dataset(deploy_dataset_spec) - def _transform_prepare_deployment_model_for_predict(self, _): - return self.model_.copy() - - def _transform_deployment_post_process( - self, deployment_ds, output_table_path, drop_key - ): - assert deployment_ds.is_multitable - return super()._transform_deployment_post_process( - deployment_ds, output_table_path, drop_key - ) + def _transform_prepare_deployment_for_predict(self, _): + # TODO: Replace the second return value (the output columns' dtypes) with a + # proper value instead of `None`. In the current state, it will use pandas + # type auto-detection to load the internal table into memory. 
+ return self.model_.copy(), None def fit_predict(self, X, y=None, **kwargs): """Performs clustering on X and returns result (instead of labels)""" @@ -1418,6 +1403,7 @@ def __init__( self.specific_pairs = specific_pairs self.all_possible_pairs = all_possible_pairs self.construction_rules = construction_rules + self._original_target_dtype = None self._predicted_target_meta_data_tag = None # Deprecation message for 'key' constructor parameter @@ -1625,6 +1611,22 @@ def _fit_training_post_process(self, ds): # Call parent method super()._fit_training_post_process(ds) + # Save the target and key column dtype's + if ds.is_in_memory: + if self._original_target_dtype is None: + self._original_target_dtype = ds.target_column.dtype + if ds.main_table.key is not None: + self._original_key_dtypes = {} + for column_id in ds.main_table.key: + self._original_key_dtypes[column_id] = ( + ds.main_table.get_column_dtype(column_id) + ) + else: + self._original_key_dtypes = None + else: + self._original_target_dtype = None + self._original_key_dtypes = None + # Set the target variable name self.model_target_variable_name_ = get_khiops_variable_name(ds.target_column_id) @@ -1800,6 +1802,7 @@ def __init__( ) # Data to be specified by inherited classes self._predicted_target_meta_data_tag = None + self._predicted_target_name_prefix = None self.n_evaluated_features = n_evaluated_features self.n_selected_features = n_selected_features @@ -1827,7 +1830,7 @@ def predict(self, X): y_pred = super()._transform( ds, computation_dir, - self._transform_prepare_deployment_model_for_predict, + self._transform_prepare_deployment_for_predict, True, "predict.txt", ) @@ -1855,7 +1858,7 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir): return args, kwargs - def _transform_prepare_deployment_model_for_predict(self, ds): + def _transform_prepare_deployment_for_predict(self, ds): assert ( self._predicted_target_meta_data_tag is not None ), "Predicted target metadata tag is not set" @@ 
-1880,7 +1883,20 @@ def _transform_prepare_deployment_model_for_predict(self, ds): if self.model_target_variable_name_ not in list(ds.main_table.column_ids): model_dictionary.remove_variable(self.model_target_variable_name_) - return model_copy + # Create the output column dtype dict + if ds.is_in_memory: + predicted_target_column_name = ( + self._predicted_target_name_prefix + self.model_target_variable_name_ + ) + output_columns_dtype = { + predicted_target_column_name: self._original_target_dtype + } + if self.is_multitable_model_: + output_columns_dtype.update(self._original_key_dtypes) + else: + output_columns_dtype = None + + return model_copy, output_columns_dtype def get_feature_used_statistics(self, modeling_report): # Extract, from the modeling report, names, levels, weights and importances @@ -1895,7 +1911,7 @@ def get_feature_used_statistics(self, modeling_report): for var in modeling_report.selected_variables ] ) - # Return empty arrays if not selected_variables is available + # Return empty arrays if no selected variables are available else: feature_used_names_ = np.array([], dtype=np.dtype("