Skip to content

Commit cea9546

Browse files
committed
Add "Text" Khiops type assignment heuristic
Thus, "Text" is assigned to a dataframe input column if: (1) its type is StringDType, and (2) the maximum length of its entries is > 100 (same heuristic as Khiops core when called through `api.build_dictionary_from_data_table`).
1 parent 9548b64 commit cea9546

2 files changed

Lines changed: 44 additions & 14 deletions

File tree

khiops/sklearn/dataset.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -236,18 +236,25 @@ def _upgrade_mapping_spec(ds_spec):
236236
return new_ds_spec
237237

238238

239-
def get_khiops_type(numpy_type):
239+
def get_khiops_type(numpy_type, max_type_size=None):
240240
"""Translates a numpy dtype to a Khiops dictionary type
241241
242242
Parameters
243243
----------
244-
numpy_type : `numpy.dtype`:
244+
numpy_type : `numpy.dtype`
245245
Numpy type of the column
246+
max_type_size : `int`, optional
247+
Maximum length of the entries of the column whose type is ``numpy_type``.
246248
247249
Returns
248250
-------
249251
str
250-
Khiops type name. Either "Categorical", "Numerical" or "Timestamp"
252+
Khiops type name. Either "Categorical", "Text", "Numerical" or "Timestamp".
253+
254+
.. note::
255+
The "Text" Khiops type is inferred if the Numpy type is "string"
256+
and the maximum length of the entries of that type is greater than 100.
257+
251258
"""
252259
lower_numpy_type = str(numpy_type).lower()
253260

@@ -257,6 +264,18 @@ def get_khiops_type(numpy_type):
257264
# float<x>, int<x>, uint<x> types
258265
elif "int" in lower_numpy_type or "float" in lower_numpy_type:
259266
khiops_type = "Numerical"
267+
elif lower_numpy_type == "string":
268+
if max_type_size is not None:
269+
if not isinstance(max_type_size, (int, np.int64)):
270+
raise TypeError(
271+
type_error_message("max_type_size", max_type_size, int, np.int64)
272+
)
273+
if max_type_size > 100:
274+
khiops_type = "Text"
275+
else:
276+
khiops_type = "Categorical"
277+
else:
278+
khiops_type = "Categorical"
260279
# bool_ and object, character, bytes_, str_, void, record and other types
261280
else:
262281
khiops_type = "Categorical"
@@ -956,10 +975,14 @@ def __init__(self, name, dataframe, key=None):
956975
)
957976

958977
# Initialize Khiops types
959-
self.khiops_types = {
960-
column_id: get_khiops_type(self.data_source.dtypes[column_id])
961-
for column_id in self.column_ids
962-
}
978+
self.khiops_types = {}
979+
for column_id in self.column_ids:
980+
column = self.data_source[column_id]
981+
column_numpy_type = column.dtype
982+
column_max_size = column.astype("str").str.len().max()
983+
self.khiops_types[column_id] = get_khiops_type(
984+
column_numpy_type, column_max_size
985+
)
963986

964987
# Check key integrity
965988
self.check_key()

tests/test_dataset_class.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -100,15 +100,17 @@ def create_monotable_dataframe(self):
100100
False,
101101
False,
102102
],
103+
# Make some entries longer than 100 characters in the "Title" column
104+
# to trigger the "Text" Khiops type assignment heuristic
103105
"Title": [
104-
"Awesome",
106+
(15 * "Awesome ").strip(),
105107
"Very lovely",
106108
"Some major design flaws",
107109
"My favorite buy!",
108-
"Flattering shirt",
110+
(7 * "Flattering shirt ").strip(),
109111
"Not for the very petite",
110112
"Cagrcoal shimmer fun",
111-
"Shimmer, surprisingly goes with lots",
113+
(3 * "Shimmer, surprisingly goes with lots ").strip(),
112114
"Flattering",
113115
"Such a fun dress!",
114116
],
@@ -128,6 +130,9 @@ def create_monotable_dataframe(self):
128130
],
129131
}
130132
dataset = pd.DataFrame(data)
133+
134+
# Force StringDType on "Title" so that it is assigned the "Text" Khiops type
135+
dataset["Title"] = dataset["Title"].astype("string")
131136
return dataset
132137

133138
def create_monotable_data_file(self, table_path):
@@ -294,7 +299,7 @@ def get_ref_var_types(self, multitable, schema=None):
294299
"Clothing ID": "Numerical",
295300
"Date": "Timestamp",
296301
"New": "Categorical",
297-
"Title": "Categorical",
302+
"Title": "Text",
298303
"Recommended IND": "Numerical",
299304
"Positive Feedback average": "Numerical",
300305
"class": "Categorical",
@@ -489,7 +494,9 @@ def test_out_file_from_dataframe_monotable(self):
489494

490495
# Create and load the intermediary Khiops file
491496
out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir)
492-
out_table = pd.read_csv(out_table_path, sep="\t")
497+
498+
# Force StringDType on the "Title" column upon CSV reading
499+
out_table = pd.read_csv(out_table_path, sep="\t", dtype={"Title": "string"})
493500

494501
# Cast "Date" columns to datetime as we don't automatically recognize dates
495502
out_table["Date"] = out_table["Date"].astype("datetime64[ns]")
@@ -799,7 +806,7 @@ def create_monotable_dataset_with_newlines():
799806
],
800807
"Age": [39],
801808
"Title": [
802-
"Shimmer,\nsurprisingly\n\rgoes with lots",
809+
(3 * "Shimmer,\nsurprisingly\n\rgoes with lots").strip(),
803810
],
804811
}
805812
dataset = pd.DataFrame(data)
@@ -814,7 +821,7 @@ def test_newlines_removed_from_csv_file_for_khiops(self):
814821
out_table = pd.read_csv(out_table_path, sep="\t")
815822

816823
self.assertEqual(
817-
"Shimmer, surprisingly goes with lots",
824+
(3 * "Shimmer, surprisingly goes with lots").strip(),
818825
out_table.Title[0],
819826
"Newlines should have been removed from the data",
820827
)

0 commit comments

Comments
 (0)