Skip to content

Commit 0162ef9

Browse files
author
Thierry RAMORASOAVINA
committed
Adapt to pandas 3.0
- For Python 3.10, pandas 2.3.3 is still used but with the new pandas StringDtype enabled - For Python 3.11+, the later 3.0.0+ versions are used
1 parent a2b0668 commit 0162ef9

File tree

8 files changed

+73
-46
lines changed

8 files changed

+73
-46
lines changed

doc/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@ ipykernel>=6.9.1
44
nbconvert==6.4.4
55
nbformat==5.3.0
66
numpydoc>=1.5.0
7-
pandas>=0.25.3,<=2.3.3
7+
pandas>=2.3.3,<=4.0.0
88
scikit-learn>=1.7.2,<1.9.0
99
sphinx-copybutton>=0.5.0

khiops/sklearn/dataset.py

Lines changed: 40 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
import numpy as np
1515
import pandas as pd
16-
import sklearn
16+
from pandas.core.dtypes.common import is_numeric_dtype, is_string_dtype
1717
from scipy import sparse as sp
1818
from sklearn.utils import check_array
1919
from sklearn.utils.validation import column_or_1d
@@ -33,6 +33,12 @@
3333
# pylint --disable=all --enable=invalid-names dataset.py
3434
# pylint: disable=invalid-name
3535

36+
# Set a special pandas option to force the new string data type (`StringDtype`)
37+
# even for the 2.x series (2.3.3), which is still required for Python 3.10.
38+
# This new string data type no longer maps to the corresponding numpy one
39+
# and will break the code unless special care is taken
40+
pd.options.future.infer_string = True
41+
3642

3743
def check_dataset_spec(ds_spec):
3844
"""Checks that a dataset spec is valid
@@ -393,16 +399,19 @@ def write_internal_data_table(dataframe, file_path_or_stream):
393399

394400

395401
def _column_or_1d_with_dtype(y, dtype=None):
396-
# 'dtype' has been introduced on `column_or_1d' since Scikit-learn 1.2;
397-
if sklearn.__version__ < "1.2":
398-
if pd.api.types.is_string_dtype(dtype) and y.isin(["True", "False"]).all():
399-
warnings.warn(
400-
"'y' stores strings restricted to 'True'/'False' values: "
401-
"The predict method may return a bool vector."
402-
)
403-
return column_or_1d(y, warn=True)
404-
else:
405-
return column_or_1d(y, warn=True, dtype=dtype)
402+
"""Checks the data is of the provided `dtype`.
403+
If a problem is detected a warning is printed or an error raised,
404+
otherwise the pandas object is transformed into a numpy.array
405+
"""
406+
407+
# Since pandas 3.0 (and even in 2.0 if the option is activated)
408+
# a new StringDtype is used to handle strings.
409+
# It does not match any longer the one recognized by numpy.
410+
# We need to force the translation to the numpy dtype
411+
# whenever a pandas string is detected (`is_string_dtype` returns `True`).
412+
if is_string_dtype(dtype):
413+
dtype = np.dtype(str)
414+
return column_or_1d(y, warn=True, dtype=dtype)
406415

407416

408417
class Dataset:
@@ -965,21 +974,23 @@ def __init__(self, name, dataframe, key=None):
965974

966975
# Initialize feature columns and verify their types
967976
self.column_ids = self.data_source.columns.values
968-
if not np.issubdtype(self.column_ids.dtype, np.integer):
969-
if np.issubdtype(self.column_ids.dtype, object):
970-
for i, column_id in enumerate(self.column_ids):
971-
if not isinstance(column_id, str):
972-
raise TypeError(
973-
f"Dataframe column ids must be either all integers or "
974-
f"all strings. Column id at index {i} ('{column_id}') is"
975-
f" of type '{type(column_id).__name__}'"
976-
)
977-
else:
978-
raise TypeError(
979-
f"Dataframe column ids must be either all integers or "
980-
f"all strings. The column index has dtype "
981-
f"'{self.column_ids.dtype}'"
982-
)
977+
# Ensure the feature columns are either all string
978+
# or all numeric but not a mix of both.
979+
# Warning: the new pandas string data type (`StringDtype`)
980+
# - by default in pandas 3.0 or forced in pandas 2.0 -
981+
# cannot be evaluated by `np.issubdtype`, any attempt will raise an error.
982+
if not is_numeric_dtype(self.column_ids) and not is_string_dtype(
983+
self.column_ids
984+
):
985+
previous_type = None
986+
for i, column_id in enumerate(self.column_ids):
987+
if previous_type is not None and type(column_id) != previous_type:
988+
raise TypeError(
989+
f"Dataframe column ids must be either all integers or "
990+
f"all strings. Column id at index {i} ('{column_id}') is"
991+
f" of type '{type(column_id).__name__}'"
992+
)
993+
previous_type = type(column_id)
983994

984995
# Initialize Khiops types
985996
self.khiops_types = {}
@@ -988,7 +999,8 @@ def __init__(self, name, dataframe, key=None):
988999
column_numpy_type = column.dtype
9891000
column_max_size = None
9901001
if isinstance(column_numpy_type, pd.StringDtype):
991-
column_max_size = column.str.len().max()
1002+
# Warning: pandas.Series.str.len() returns a float64
1003+
column_max_size = int(column.str.len().max())
9921004
self.khiops_types[column_id] = get_khiops_type(
9931005
column_numpy_type, column_max_size
9941006
)
@@ -1161,7 +1173,7 @@ def __init__(self, name, matrix, key=None):
11611173
raise TypeError(
11621174
type_error_message("matrix", matrix, "scipy.sparse.spmatrix")
11631175
)
1164-
if not np.issubdtype(matrix.dtype, np.number):
1176+
if not is_numeric_dtype(matrix.dtype):
11651177
raise TypeError(
11661178
type_error_message("'matrix' dtype", matrix.dtype, "numeric")
11671179
)

khiops/sklearn/estimators.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2021,7 +2021,7 @@ def predict_proba(self, X):
20212021
y_probas, (pd.DataFrame, np.ndarray)
20222022
), "y_probas is not a Pandas DataFrame nor Numpy array"
20232023
y_probas = y_probas.reindex(
2024-
self._sorted_prob_variable_names(), axis=1, copy=False
2024+
self._sorted_prob_variable_names(), axis=1
20252025
).to_numpy(copy=False)
20262026

20272027
assert isinstance(y_probas, (str, np.ndarray)), "Expected str or np.ndarray"
@@ -2265,7 +2265,7 @@ def predict(self, X):
22652265

22662266
# Transform to np.ndarray
22672267
if isinstance(y_pred, pd.DataFrame):
2268-
y_pred = y_pred.astype("float64", copy=False).to_numpy(copy=False).ravel()
2268+
y_pred = y_pred.astype("float64").to_numpy(copy=False).ravel()
22692269

22702270
assert isinstance(y_pred, (str, np.ndarray)), "Expected str or np.array"
22712271
return y_pred

packaging/conda/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ requirements:
2424
run:
2525
- python
2626
- khiops-core =11.0.0
27-
- pandas >=0.25.3,<=2.3.3
27+
- pandas >=2.3.3,<=4.0.0
2828
- scikit-learn>=1.7.2,<1.9.0
2929
run_constrained:
3030
# do not necessary use the latest version

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ classifiers = [
105105
requires-python = ">=3.8"
106106
dependencies = [
107107
# do not use the latest versions, to avoid undesired breaking changes
108-
"pandas>=0.25.3,<=2.3.3",
108+
"pandas>=2.3.3,<=4.0.0",
109109
"scikit-learn>=1.7.2,<1.9.0",
110110
]
111111

tests/test_dataset_class.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -74,18 +74,24 @@ def create_monotable_dataframe(self):
7474
1077,
7575
1077,
7676
],
77+
# Since pandas 3.0 the default precision for parsing a datetime
78+
# is now microseconds (us) instead of nanoseconds (ns)
79+
# unless enough precision is given.
80+
# Unfortunately only the changelog states this, not the docstring.
81+
# To avoid any comparison error in tests
82+
# we need to add the required precision to the datetime
7783
"Date": pd.to_datetime(
7884
[
79-
"2019-03-22",
80-
"2019-03-23",
81-
"2019-03-24",
82-
"2019-03-25",
83-
"2019-03-26",
84-
"2019-03-27",
85-
"2019-03-28",
86-
"2019-03-29",
87-
"2019-03-30",
88-
"2019-03-31",
85+
"2019-03-22 00:00:00.123456789",
86+
"2019-03-23 00:00:00.123456789",
87+
"2019-03-24 00:00:00.123456789",
88+
"2019-03-25 00:00:00.123456789",
89+
"2019-03-26 00:00:00.123456789",
90+
"2019-03-27 00:00:00.123456789",
91+
"2019-03-28 00:00:00.123456789",
92+
"2019-03-29 00:00:00.123456789",
93+
"2019-03-30 00:00:00.123456789",
94+
"2019-03-31 00:00:00.123456789",
8995
],
9096
),
9197
"New": [
@@ -499,6 +505,12 @@ def test_out_file_from_dataframe_monotable(self):
499505
out_table = pd.read_csv(out_table_path, sep="\t", dtype={"Title": "string"})
500506

501507
# Cast "Date" columns to datetime as we don't automatically recognize dates
508+
# Since pandas 3.0 the default precision for parsing a datetime
509+
# is now microseconds (us) instead of nanoseconds (ns)
510+
# unless enough precision is given.
511+
# Unfortunately only the changelog states this, not the docstring.
512+
# To avoid any comparison error in tests
513+
# we need to add the required precision to the datetime
502514
out_table["Date"] = out_table["Date"].astype("datetime64[ns]")
503515
ref_table = spec["main_table"][0]
504516
ref_table["class"] = y

tests/test_dataset_errors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,6 @@ def test_pandas_table_column_ids_must_all_be_int_or_str(self):
595595
output_error_msg = str(context.exception)
596596
expected_msg = (
597597
"Dataframe column ids must be either all integers or all "
598-
"strings. Column id at index 0 ('1') is of type 'int'"
598+
"strings. Column id at index 1 ('Age') is of type 'str'"
599599
)
600600
self.assertEqual(output_error_msg, expected_msg)

tests/test_helper.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -319,8 +319,11 @@ def prepare_data(data, target_variable, primary_table=None, y_as_dataframe=False
319319
data, test_size=0.3, random_state=1, shuffle=False
320320
)
321321

322-
y_test = data_test[target_variable]
323-
y_train = data_train[target_variable]
322+
# Since pandas 3.0, numbers in an array that also contains a carriage return
323+
# are lazily and wrongly inferred as `object` dtype instead of `int64`;
324+
# calling `infer_objects()` forces pandas to re-infer the dtype and fixes the error
325+
y_test = data_test.infer_objects()[target_variable]
326+
y_train = data_train.infer_objects()[target_variable]
324327

325328
# Create training labels as single-column dataframe
326329
if y_as_dataframe:

0 commit comments

Comments
 (0)