Skip to content

Commit 465566c

Browse files
authored
Merge pull request #558 from KhiopsML/536-adapt-to-pandas3
Adapt to pandas 3.0
2 parents a2b0668 + d044263 commit 465566c

File tree

8 files changed

+84
-39
lines changed

8 files changed

+84
-39
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@ pip install -U khiops
3737
Other installation methods are documented at the [Khiops website][khiops-install].
3838

3939
### Requirements
40-
- [Python][python] (>=3.8)
41-
- [Pandas][pandas] (>=0.25.3)
42-
- [Scikit-Learn][sklearn] (>=0.22.2)
40+
- [Python][python] (>=3.10)
41+
- [Pandas][pandas] (>=2.3.3)
42+
- [Scikit-Learn][sklearn] (>=1.7.2)
4343

4444
[pandas]: https://pandas.pydata.org
4545
[sklearn]: https://scikit-learn.org/stable

doc/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@ ipykernel>=6.9.1
44
nbconvert==6.4.4
55
nbformat==5.3.0
66
numpydoc>=1.5.0
7-
pandas>=0.25.3,<=2.3.3
7+
pandas>=2.3.3,<4.0.0
88
scikit-learn>=1.7.2,<1.9.0
99
sphinx-copybutton>=0.5.0

khiops/sklearn/dataset.py

Lines changed: 68 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313

1414
import numpy as np
1515
import pandas as pd
16-
import sklearn
1716
from scipy import sparse as sp
1817
from sklearn.utils import check_array
1918
from sklearn.utils.validation import column_or_1d
@@ -33,6 +32,13 @@
3332
# pylint --disable=all --enable=invalid-names dataset.py
3433
# pylint: disable=invalid-name
3534

35+
# Set a special pandas option to force the new string data type (`StringDtype`)
36+
# even for version 2.0 which is still required for python 3.10.
37+
# This new string data type no longer maps to a NumPy data type.
38+
# Hence, code assuming NumPy type compatibility will break unless
39+
# this string data type is handled separately.
40+
pd.options.future.infer_string = True
41+
3642

3743
def check_dataset_spec(ds_spec):
3844
"""Checks that a dataset spec is valid
@@ -393,16 +399,11 @@ def write_internal_data_table(dataframe, file_path_or_stream):
393399

394400

395401
def _column_or_1d_with_dtype(y, dtype=None):
396-
# 'dtype' has been introduced on `column_or_1d' since Scikit-learn 1.2;
397-
if sklearn.__version__ < "1.2":
398-
if pd.api.types.is_string_dtype(dtype) and y.isin(["True", "False"]).all():
399-
warnings.warn(
400-
"'y' stores strings restricted to 'True'/'False' values: "
401-
"The predict method may return a bool vector."
402-
)
403-
return column_or_1d(y, warn=True)
404-
else:
405-
return column_or_1d(y, warn=True, dtype=dtype)
402+
"""Checks that the data is of the provided `dtype`.
403+
If a problem is detected a warning is printed or an error raised,
404+
otherwise the pandas object is transformed into a numpy.array
405+
"""
406+
return column_or_1d(y, warn=True, dtype=dtype)
406407

407408

408409
class Dataset:
@@ -607,16 +608,54 @@ def _init_target_column(self, y):
607608
# pandas.Series, pandas.DataFrame or numpy.ndarray
608609
else:
609610
if hasattr(y, "dtype"):
611+
if not isinstance(y, np.ndarray):
612+
# Since pandas 3.0, numbers and boolean values in an array
613+
# that contains a carriage return are wrongly inferred
614+
# as `object` dtype instead of `int64` and
615+
# `bool`, respectively.
616+
# Forcing pandas to `infer_objects` fixes the error
617+
if pd.api.types.is_object_dtype(y):
618+
y = y.infer_objects()
619+
620+
# Since pandas 3.0 (and even in 2.0 if the option is activated)
621+
# a new `StringDtype` is used to handle strings.
622+
# It does not match any longer the one recognized by numpy.
623+
# An issue was created on scikit-learn
624+
# https://github.com/scikit-learn/scikit-learn/issues/33383
625+
# Until it is fixed, 'y' is not checked by
626+
# `_column_or_1d_with_dtype` when pandas dtype is `StringDtype`.
627+
610628
if isinstance(y.dtype, pd.CategoricalDtype):
611629
y_checked = _column_or_1d_with_dtype(
612-
y, dtype=y.dtype.categories.dtype
630+
y,
631+
dtype=(
632+
y.dtype.categories.dtype
633+
if not pd.api.types.is_string_dtype(
634+
y.dtype.categories.dtype
635+
)
636+
else None
637+
),
613638
)
614639
else:
615-
y_checked = _column_or_1d_with_dtype(y, dtype=y.dtype)
640+
y_checked = _column_or_1d_with_dtype(
641+
y,
642+
dtype=(
643+
y.dtype
644+
if not pd.api.types.is_string_dtype(y.dtype)
645+
else None
646+
),
647+
)
616648
elif hasattr(y, "dtypes"):
617649
if isinstance(y.dtypes.iloc[0], pd.CategoricalDtype):
618650
y_checked = _column_or_1d_with_dtype(
619-
y, dtype=y.dtypes.iloc[0].categories.dtype
651+
y,
652+
dtype=(
653+
y.dtypes.iloc[0].categories.dtype
654+
if not pd.api.types.is_string_dtype(
655+
y.dtypes.iloc[0].categories.dtype
656+
)
657+
else None
658+
),
620659
)
621660
else:
622661
y_checked = _column_or_1d_with_dtype(y)
@@ -965,21 +1004,16 @@ def __init__(self, name, dataframe, key=None):
9651004

9661005
# Initialize feature columns and verify their types
9671006
self.column_ids = self.data_source.columns.values
968-
if not np.issubdtype(self.column_ids.dtype, np.integer):
969-
if np.issubdtype(self.column_ids.dtype, object):
970-
for i, column_id in enumerate(self.column_ids):
971-
if not isinstance(column_id, str):
972-
raise TypeError(
973-
f"Dataframe column ids must be either all integers or "
974-
f"all strings. Column id at index {i} ('{column_id}') is"
975-
f" of type '{type(column_id).__name__}'"
976-
)
977-
else:
978-
raise TypeError(
979-
f"Dataframe column ids must be either all integers or "
980-
f"all strings. The column index has dtype "
981-
f"'{self.column_ids.dtype}'"
982-
)
1007+
# Ensure the feature columns are either all string
1008+
# or all numeric but not a mix of both.
1009+
if not pd.api.types.is_numeric_dtype(
1010+
self.column_ids
1011+
) and not pd.api.types.is_string_dtype(self.column_ids):
1012+
raise TypeError(
1013+
"Dataframe column ids must be either all integers or "
1014+
"all strings. Columns have the following mixed types: "
1015+
f"{sorted(set([type(cid).__name__ for cid in self.column_ids]))}."
1016+
)
9831017

9841018
# Initialize Khiops types
9851019
self.khiops_types = {}
@@ -988,7 +1022,11 @@ def __init__(self, name, dataframe, key=None):
9881022
column_numpy_type = column.dtype
9891023
column_max_size = None
9901024
if isinstance(column_numpy_type, pd.StringDtype):
991-
column_max_size = column.str.len().max()
1025+
# If a value is missing in column,
1026+
# column.str.len() would be typed as float64 instead of int64
1027+
# Until this is changed, the type is forced to int64
1028+
# cf https://github.com/pandas-dev/pandas/issues/51948
1029+
column_max_size = column.str.len().astype(pd.Int64Dtype()).max()
9921030
self.khiops_types[column_id] = get_khiops_type(
9931031
column_numpy_type, column_max_size
9941032
)

khiops/sklearn/estimators.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2021,7 +2021,7 @@ def predict_proba(self, X):
20212021
y_probas, (pd.DataFrame, np.ndarray)
20222022
), "y_probas is not a Pandas DataFrame nor Numpy array"
20232023
y_probas = y_probas.reindex(
2024-
self._sorted_prob_variable_names(), axis=1, copy=False
2024+
self._sorted_prob_variable_names(), axis=1
20252025
).to_numpy(copy=False)
20262026

20272027
assert isinstance(y_probas, (str, np.ndarray)), "Expected str or np.ndarray"
@@ -2265,7 +2265,7 @@ def predict(self, X):
22652265

22662266
# Transform to np.ndarray
22672267
if isinstance(y_pred, pd.DataFrame):
2268-
y_pred = y_pred.astype("float64", copy=False).to_numpy(copy=False).ravel()
2268+
y_pred = y_pred.astype("float64").to_numpy(copy=False).ravel()
22692269

22702270
assert isinstance(y_pred, (str, np.ndarray)), "Expected str or np.array"
22712271
return y_pred

packaging/conda/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ requirements:
2424
run:
2525
- python
2626
- khiops-core =11.0.0
27-
- pandas >=0.25.3,<=2.3.3
27+
- pandas >=2.3.3,<4.0.0
2828
- scikit-learn>=1.7.2,<1.9.0
2929
run_constrained:
3030
# do not necessary use the latest version

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ classifiers = [
105105
requires-python = ">=3.8"
106106
dependencies = [
107107
# do not use the latest versions, to avoid undesired breaking changes
108-
"pandas>=0.25.3,<=2.3.3",
108+
"pandas>=2.3.3,<4.0.0",
109109
"scikit-learn>=1.7.2,<1.9.0",
110110
]
111111

tests/test_dataset_class.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,13 @@ def test_out_file_from_dataframe_monotable(self):
503503
ref_table = spec["main_table"][0]
504504
ref_table["class"] = y
505505

506+
# Since pandas 3.0 the default precision for parsing a datetime
507+
# is now microseconds (us) instead of nanoseconds (ns)
508+
# unless enough precision is given.
509+
# Unfortunately only the changelog states this, not the docstring.
510+
# To avoid any comparison error in tests
511+
# we need to set the required precision (ns) on the datetime
512+
ref_table["Date"] = ref_table["Date"].astype("datetime64[ns]")
506513
# Check that the dataframes are equal
507514
assert_frame_equal(
508515
ref_table.sort_values(by="User_ID").reset_index(drop=True),

tests/test_dataset_errors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,6 @@ def test_pandas_table_column_ids_must_all_be_int_or_str(self):
595595
output_error_msg = str(context.exception)
596596
expected_msg = (
597597
"Dataframe column ids must be either all integers or all "
598-
"strings. Column id at index 0 ('1') is of type 'int'"
598+
"strings. Columns have the following mixed types: ['int', 'str']."
599599
)
600600
self.assertEqual(output_error_msg, expected_msg)

0 commit comments

Comments
 (0)