Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions khiops/sklearn/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,10 @@ def write_internal_data_table(dataframe, file_path_or_stream):
- UTF-8 encoding
- The index is not written

Khiops cannot handle multi-line records.
Hence, the carriage returns / line feeds found in the records are replaced
with blank spaces before the data is handed over to Khiops.

Parameters
----------
dataframe : `pandas.DataFrame`
Expand All @@ -412,6 +416,10 @@ def write_internal_data_table(dataframe, file_path_or_stream):
The path of the internal data table file to be written or a writable file
object.
"""
# Replace carriage returns / line feeds with blank spaces
# so that every text field stays on a single line
dataframe = dataframe.replace(["\r", "\n"], " ", regex=True)

dataframe.to_csv(
file_path_or_stream,
sep="\t",
Expand Down
9 changes: 8 additions & 1 deletion khiops/sklearn/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,13 @@ def _cleanup_dir(target_dir):
class KhiopsEstimator(ABC, BaseEstimator):
"""Base class for Khiops Scikit-learn estimators

.. note::
The input features collection X needs to have single-line records
so that Khiops can handle them.
Hence, multi-line records are preprocessed:
carriage returns / line feeds are replaced
with blank spaces before X is handed over to Khiops.
A memory penalty is incurred, as a new dataframe is generated
by this preprocessing.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add:

A memory penalty is incurred, as a new dataframe is generated following the preprocessing.


Parameters
----------
verbose : bool, default ``False``
Expand Down Expand Up @@ -1695,7 +1702,7 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
.. note::

Visit `the Khiops site <https://khiops.org/learn/understand>`_ to learn
abouth the automatic feature engineering algorithm.
about the automatic feature engineering algorithm.

Parameters
----------
Expand Down
45 changes: 44 additions & 1 deletion tests/test_dataset_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
# which is available at https://spdx.org/licenses/BSD-3-Clause-Clear.html or #
# see the "LICENSE.md" file for more details. #
######################################################################################
"""Test consistency of the created files with the input data"""
import os
import shutil
import unittest
Expand Down Expand Up @@ -700,3 +699,47 @@ def _test_domain_coherence(self, ds, ref_var_types):
for var in out_domain.get_dictionary(table.name).variables
}
self.assertEqual(ref_var_types[table.name], out_dictionary_var_types)


class DataFramePreprocessingTests(unittest.TestCase):
    """Check that X (the input features collection) is preprocessed on write

    The table file created for Khiops must only contain single-line records:
    each carriage return / line feed of a text field is expected to be replaced
    with one blank space in the written file.
    """

    def setUp(self):
        """Set up the test-specific output directory"""
        # One directory per test method, so that tests do not share files
        self.output_dir = os.path.join("resources", "tmp", self._testMethodName)
        os.makedirs(self.output_dir, exist_ok=True)

    def tearDown(self):
        """Clean up the test-specific output directory"""
        shutil.rmtree(self.output_dir, ignore_errors=True)
        del self.output_dir

    @staticmethod
    def create_monotable_dataset_with_newlines():
        """Create a one-row dataframe whose ``Title`` field spans several lines

        Returns
        -------
        `pandas.DataFrame`
            A dataframe with the columns ``User_ID``, ``Age`` and ``Title``; the
            ``Title`` value contains a line feed followed, further on, by a
            line feed + carriage return pair.
        """
        data = {
            "User_ID": [
                "Cm6fu01r99",
            ],
            "Age": [39],
            "Title": [
                "Shimmer,\nsurprisingly\n\rgoes with lots",
            ],
        }
        return pd.DataFrame(data)

    def test_newlines_removed_from_csv_file_for_khiops(self):
        """Newline characters must not reach the table file written for Khiops"""
        dataset = Dataset(
            DataFramePreprocessingTests.create_monotable_dataset_with_newlines()
        )

        # Only the first returned value (the path of the written table) is used
        out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir)
        out_table = pd.read_csv(out_table_path, sep="\t")

        # Every "\n" and "\r" is replaced individually by one blank space:
        # the "\n\r" sequence of the input therefore yields two consecutive blanks
        self.assertEqual(
            "Shimmer, surprisingly  goes with lots",
            out_table.Title[0],
            "Newlines should have been removed from the data",
        )