diff --git a/khiops/sklearn/dataset.py b/khiops/sklearn/dataset.py index 9353bbd5..5c9e071c 100644 --- a/khiops/sklearn/dataset.py +++ b/khiops/sklearn/dataset.py @@ -404,6 +404,10 @@ def write_internal_data_table(dataframe, file_path_or_stream): - UTF-8 encoding - The index is not written + Khiops cannot handle multi-line records. + Hence, carriage returns / line feeds are replaced with blank spaces + before the data is handed over to Khiops. + Parameters ---------- dataframe : `pandas.DataFrame` @@ -412,6 +416,10 @@ def write_internal_data_table(dataframe, file_path_or_stream): The path of the internal data table file to be written or a writable file object. """ + # Replace carriage returns / line feeds with blank spaces + # in order to always keep single-line text fields + dataframe = dataframe.replace(["\r", "\n"], " ", regex=True) + dataframe.to_csv( file_path_or_stream, sep="\t", diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 02946e50..f24afd81 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -223,6 +223,13 @@ def _cleanup_dir(target_dir): class KhiopsEstimator(ABC, BaseEstimator): """Base class for Khiops Scikit-learn estimators + .. note:: + The input features collection X needs to have single-line records + so that Khiops can handle them. + Hence, multi-line records are preprocessed: + carriage returns / line feeds are replaced + with blank spaces before being handed over to Khiops. + Parameters ---------- verbose : bool, default ``False`` @@ -1695,7 +1702,7 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor): .. note:: Visit `the Khiops site `_ to learn - abouth the automatic feature engineering algorithm. + about the automatic feature engineering algorithm. 
Parameters ---------- diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index ab94dfa4..b6d32d69 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -4,7 +4,6 @@ # which is available at https://spdx.org/licenses/BSD-3-Clause-Clear.html or # # see the "LICENSE.md" file for more details. # ###################################################################################### -"""Test consistency of the created files with the input data""" import os import shutil import unittest @@ -700,3 +699,47 @@ def _test_domain_coherence(self, ds, ref_var_types): for var in out_domain.get_dictionary(table.name).variables } self.assertEqual(ref_var_types[table.name], out_dictionary_var_types) + + +class DataFramePreprocessingTests(unittest.TestCase): + """Check that the preprocessing of X (input features collection) is actually done + when writing the csv used later by Khiops + """ + + def setUp(self): + """Set-up test-specific output directory""" + self.output_dir = os.path.join("resources", "tmp", self._testMethodName) + os.makedirs(self.output_dir, exist_ok=True) + + def tearDown(self): + """Clean-up test-specific output directory""" + shutil.rmtree(self.output_dir, ignore_errors=True) + del self.output_dir + + @staticmethod + def create_monotable_dataset_with_newlines(): + data = { + "User_ID": [ + "Cm6fu01r99", + ], + "Age": [39], + "Title": [ + "Shimmer,\nsurprisingly\n\rgoes with lots", + ], + } + dataset = pd.DataFrame(data) + return dataset + + def test_newlines_removed_from_csv_file_for_khiops(self): + dataset = Dataset( + DataFramePreprocessingTests.create_monotable_dataset_with_newlines() + ) + + out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + out_table = pd.read_csv(out_table_path, sep="\t") + + self.assertEqual( + "Shimmer, surprisingly goes with lots", + out_table.Title[0], + "Newlines should have been removed from the data", + )