Skip to content

Commit 0162ef9

Browse files
author
Thierry RAMORASOAVINA
committed
Adapt to pandas 3.0
- For Python 3.10, pandas 2.3.3 is still used but with the new pandas StringDtype enabled - For Python 3.11+, the later 3.0.0+ versions are used
1 parent a2b0668 commit 0162ef9

File tree

8 files changed

+73
-46
lines changed

8 files changed

+73
-46
lines changed

doc/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@ ipykernel>=6.9.1
44
nbconvert==6.4.4
55
nbformat==5.3.0
66
numpydoc>=1.5.0
7-
pandas>=0.25.3,<=2.3.3
7+
pandas>=2.3.3,<=4.0.0
88
scikit-learn>=1.7.2,<1.9.0
99
sphinx-copybutton>=0.5.0

khiops/sklearn/dataset.py

Lines changed: 40 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
import numpy as np
1515
import pandas as pd
16-
import sklearn
16+
from pandas.core.dtypes.common import is_numeric_dtype, is_string_dtype
1717
from scipy import sparse as sp
1818
from sklearn.utils import check_array
1919
from sklearn.utils.validation import column_or_1d
@@ -33,6 +33,12 @@
3333
# pylint --disable=all --enable=invalid-names dataset.py
3434
# pylint: disable=invalid-name
3535

36+
# Set a special pandas option to force the new string data type (`StringDtype`)
37+
# even for the 2.x series (2.3.3), which is still required for Python 3.10.
38+
# This new string data type no longer maps to the corresponding numpy one
39+
# and will break the code unless special care is taken
40+
pd.options.future.infer_string = True
41+
3642

3743
def check_dataset_spec(ds_spec):
3844
"""Checks that a dataset spec is valid
@@ -393,16 +399,19 @@ def write_internal_data_table(dataframe, file_path_or_stream):
393399

394400

395401
def _column_or_1d_with_dtype(y, dtype=None):
396-
# 'dtype' has been introduced on `column_or_1d' since Scikit-learn 1.2;
397-
if sklearn.__version__ < "1.2":
398-
if pd.api.types.is_string_dtype(dtype) and y.isin(["True", "False"]).all():
399-
warnings.warn(
400-
"'y' stores strings restricted to 'True'/'False' values: "
401-
"The predict method may return a bool vector."
402-
)
403-
return column_or_1d(y, warn=True)
404-
else:
405-
return column_or_1d(y, warn=True, dtype=dtype)
402+
"""Checks the data is of the provided `dtype`.
403+
If a problem is detected a warning is printed or an error raised,
404+
otherwise the pandas object is transformed into a numpy.array
405+
"""
406+
407+
# Since pandas 3.0 (and even in 2.0 if the option is activated)
408+
# a new StringDtype is used to handle strings.
409+
# It does not match any longer the one recognized by numpy.
410+
# We need to force the translation to the numpy dtype
411+
# whenever a pandas string is detected (`is_string_dtype` returns `True`).
412+
if is_string_dtype(dtype):
413+
dtype = np.dtype(str)
414+
return column_or_1d(y, warn=True, dtype=dtype)
406415

407416

408417
class Dataset:
@@ -965,21 +974,23 @@ def __init__(self, name, dataframe, key=None):
965974

966975
# Initialize feature columns and verify their types
967976
self.column_ids = self.data_source.columns.values
968-
if not np.issubdtype(self.column_ids.dtype, np.integer):
969-
if np.issubdtype(self.column_ids.dtype, object):
970-
for i, column_id in enumerate(self.column_ids):
971-
if not isinstance(column_id, str):
972-
raise TypeError(
973-
f"Dataframe column ids must be either all integers or "
974-
f"all strings. Column id at index {i} ('{column_id}') is"
975-
f" of type '{type(column_id).__name__}'"
976-
)
977-
else:
978-
raise TypeError(
979-
f"Dataframe column ids must be either all integers or "
980-
f"all strings. The column index has dtype "
981-
f"'{self.column_ids.dtype}'"
982-
)
977+
# Ensure the feature columns are either all string
978+
# or all numeric but not a mix of both.
979+
# Warning: the new pandas string data type (`StringDtype`)
980+
# - by default in pandas 3.0 or forced in pandas 2.0 -
981+
# cannot be evaluated by `np.issubdtype`, any attempt will raise an error.
982+
if not is_numeric_dtype(self.column_ids) and not is_string_dtype(
983+
self.column_ids
984+
):
985+
previous_type = None
986+
for i, column_id in enumerate(self.column_ids):
987+
if previous_type is not None and type(column_id) != previous_type:
988+
raise TypeError(
989+
f"Dataframe column ids must be either all integers or "
990+
f"all strings. Column id at index {i} ('{column_id}') is"
991+
f" of type '{type(column_id).__name__}'"
992+
)
993+
previous_type = type(column_id)
983994

984995
# Initialize Khiops types
985996
self.khiops_types = {}
@@ -988,7 +999,8 @@ def __init__(self, name, dataframe, key=None):
988999
column_numpy_type = column.dtype
9891000
column_max_size = None
9901001
if isinstance(column_numpy_type, pd.StringDtype):
991-
column_max_size = column.str.len().max()
1002+
# Warning: pandas.Series.str.len() returns a float64
1003+
column_max_size = int(column.str.len().max())
9921004
self.khiops_types[column_id] = get_khiops_type(
9931005
column_numpy_type, column_max_size
9941006
)
@@ -1161,7 +1173,7 @@ def __init__(self, name, matrix, key=None):
11611173
raise TypeError(
11621174
type_error_message("matrix", matrix, "scipy.sparse.spmatrix")
11631175
)
1164-
if not np.issubdtype(matrix.dtype, np.number):
1176+
if not is_numeric_dtype(matrix.dtype):
11651177
raise TypeError(
11661178
type_error_message("'matrix' dtype", matrix.dtype, "numeric")
11671179
)

khiops/sklearn/estimators.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2021,7 +2021,7 @@ def predict_proba(self, X):
20212021
y_probas, (pd.DataFrame, np.ndarray)
20222022
), "y_probas is not a Pandas DataFrame nor Numpy array"
20232023
y_probas = y_probas.reindex(
2024-
self._sorted_prob_variable_names(), axis=1, copy=False
2024+
self._sorted_prob_variable_names(), axis=1
20252025
).to_numpy(copy=False)
20262026

20272027
assert isinstance(y_probas, (str, np.ndarray)), "Expected str or np.ndarray"
@@ -2265,7 +2265,7 @@ def predict(self, X):
22652265

22662266
# Transform to np.ndarray
22672267
if isinstance(y_pred, pd.DataFrame):
2268-
y_pred = y_pred.astype("float64", copy=False).to_numpy(copy=False).ravel()
2268+
y_pred = y_pred.astype("float64").to_numpy(copy=False).ravel()
22692269

22702270
assert isinstance(y_pred, (str, np.ndarray)), "Expected str or np.array"
22712271
return y_pred

packaging/conda/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ requirements:
2424
run:
2525
- python
2626
- khiops-core =11.0.0
27-
- pandas >=0.25.3,<=2.3.3
27+
- pandas >=2.3.3,<=4.0.0
2828
- scikit-learn>=1.7.2,<1.9.0
2929
run_constrained:
3030
# do not necessary use the latest version

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ classifiers = [
105105
requires-python = ">=3.8"
106106
dependencies = [
107107
# do not use the latest versions, to avoid undesired breaking changes
108-
"pandas>=0.25.3,<=2.3.3",
108+
"pandas>=2.3.3,<=4.0.0",
109109
"scikit-learn>=1.7.2,<1.9.0",
110110
]
111111

tests/test_dataset_class.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -74,18 +74,24 @@ def create_monotable_dataframe(self):
7474
1077,
7575
1077,
7676
],
77+
# Since pandas 3.0 the default precision for parsing a datetime
78+
# is now microseconds (us) instead of nanoseconds (ns)
79+
# unless enough precision is given.
80+
# Unfortunately only the changelog states this, not the docstring.
81+
# To avoid any comparison error in tests
82+
# we need to add the required precision to the datetime
7783
"Date": pd.to_datetime(
7884
[
79-
"2019-03-22",
80-
"2019-03-23",
81-
"2019-03-24",
82-
"2019-03-25",
83-
"2019-03-26",
84-
"2019-03-27",
85-
"2019-03-28",
86-
"2019-03-29",
87-
"2019-03-30",
88-
"2019-03-31",
85+
"2019-03-22 00:00:00.123456789",
86+
"2019-03-23 00:00:00.123456789",
87+
"2019-03-24 00:00:00.123456789",
88+
"2019-03-25 00:00:00.123456789",
89+
"2019-03-26 00:00:00.123456789",
90+
"2019-03-27 00:00:00.123456789",
91+
"2019-03-28 00:00:00.123456789",
92+
"2019-03-29 00:00:00.123456789",
93+
"2019-03-30 00:00:00.123456789",
94+
"2019-03-31 00:00:00.123456789",
8995
],
9096
),
9197
"New": [
@@ -499,6 +505,12 @@ def test_out_file_from_dataframe_monotable(self):
499505
out_table = pd.read_csv(out_table_path, sep="\t", dtype={"Title": "string"})
500506

501507
# Cast "Date" columns to datetime as we don't automatically recognize dates
508+
# Since pandas 3.0 the default precision for parsing a datetime
509+
# is now microseconds (us) instead of nanoseconds (ns)
510+
# unless enough precision is given.
511+
# Unfortunately only the changelog states this, not the docstring.
512+
# To avoid any comparison error in tests
513+
# we need to add the required precision to the datetime
502514
out_table["Date"] = out_table["Date"].astype("datetime64[ns]")
503515
ref_table = spec["main_table"][0]
504516
ref_table["class"] = y

tests/test_dataset_errors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,6 @@ def test_pandas_table_column_ids_must_all_be_int_or_str(self):
595595
output_error_msg = str(context.exception)
596596
expected_msg = (
597597
"Dataframe column ids must be either all integers or all "
598-
"strings. Column id at index 0 ('1') is of type 'int'"
598+
"strings. Column id at index 1 ('Age') is of type 'str'"
599599
)
600600
self.assertEqual(output_error_msg, expected_msg)

tests/test_helper.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -319,8 +319,11 @@ def prepare_data(data, target_variable, primary_table=None, y_as_dataframe=False
319319
data, test_size=0.3, random_state=1, shuffle=False
320320
)
321321

322-
y_test = data_test[target_variable]
323-
y_train = data_train[target_variable]
322+
# Since pandas 3.0, numbers in an array that also contains a carriage return
323+
# are lazily and wrongly inferred as `object` dtype instead of `int64`;
324+
# calling `infer_objects()` forces pandas to re-infer the dtype and fixes the error
325+
y_test = data_test.infer_objects()[target_variable]
326+
y_train = data_train.infer_objects()[target_variable]
324327

325328
# Create training labels as single-column dataframe
326329
if y_as_dataframe:

0 commit comments

Comments
 (0)