1313
1414import numpy as np
1515import pandas as pd
16- import sklearn
16+ from pandas . core . dtypes . common import is_numeric_dtype , is_string_dtype
1717from scipy import sparse as sp
1818from sklearn .utils import check_array
1919from sklearn .utils .validation import column_or_1d
3333# pylint --disable=all --enable=invalid-names dataset.py
3434# pylint: disable=invalid-name
3535
36+ # Set a special pandas option to force the new string data type (`StringDtype`)
37+ # even for version 2.0, which is still required for Python 3.10.
38+ # This new string data type no longer maps to the corresponding numpy one
39+ # and will break the code unless special care is taken.
40+ pd .options .future .infer_string = True
41+
3642
3743def check_dataset_spec (ds_spec ):
3844 """Checks that a dataset spec is valid
@@ -393,16 +399,18 @@ def write_internal_data_table(dataframe, file_path_or_stream):
393399
394400
395401def _column_or_1d_with_dtype (y , dtype = None ):
396- # 'dtype' has been introduced on `column_or_1d' since Scikit-learn 1.2;
397- if sklearn .__version__ < "1.2" :
398- if pd .api .types .is_string_dtype (dtype ) and y .isin (["True" , "False" ]).all ():
399- warnings .warn (
400- "'y' stores strings restricted to 'True'/'False' values: "
401- "The predict method may return a bool vector."
402- )
403- return column_or_1d (y , warn = True )
404- else :
405- return column_or_1d (y , warn = True , dtype = dtype )
402+ """Checks the data is of the provided `dtype`.
403+ If a problem is detected a warning is printed or an error raised.
404+ """
405+
406+ # Since pandas 3.0 (and even in 2.0 if the option is activated),
407+ # a new `StringDtype` is used to handle strings.
408+ # It no longer matches the string dtype recognized by numpy.
409+ # We need to force the translation to the numpy dtype
410+ # whenever a pandas string is detected (`is_string_dtype` returns `True`).
411+ if is_string_dtype (dtype ):
412+ dtype = np .dtype (str )
413+ return column_or_1d (y , warn = True , dtype = dtype )
406414
407415
408416class Dataset :
@@ -965,21 +973,23 @@ def __init__(self, name, dataframe, key=None):
965973
966974 # Initialize feature columns and verify their types
967975 self .column_ids = self .data_source .columns .values
968- if not np .issubdtype (self .column_ids .dtype , np .integer ):
969- if np .issubdtype (self .column_ids .dtype , object ):
970- for i , column_id in enumerate (self .column_ids ):
971- if not isinstance (column_id , str ):
972- raise TypeError (
973- f"Dataframe column ids must be either all integers or "
974- f"all strings. Column id at index { i } ('{ column_id } ') is"
975- f" of type '{ type (column_id ).__name__ } '"
976- )
977- else :
978- raise TypeError (
979- f"Dataframe column ids must be either all integers or "
980- f"all strings. The column index has dtype "
981- f"'{ self .column_ids .dtype } '"
982- )
976+ # Ensure the feature column ids are either all strings
977+ # or all numeric, but not a mix of both.
978+ # Warning: the new pandas string data type (`StringDtype`)
979+ # - the default in pandas 3.0, or forced in pandas 2.0 -
980+ # cannot be evaluated by `np.issubdtype`; any attempt will raise an error.
981+ if not is_numeric_dtype (self .column_ids ) and not is_string_dtype (
982+ self .column_ids
983+ ):
984+ previous_type = None
985+ for i , column_id in enumerate (self .column_ids ):
986+ if previous_type is not None and type (column_id ) != previous_type :
987+ raise TypeError (
988+ f"Dataframe column ids must be either all integers or "
989+ f"all strings. Column id at index { i } ('{ column_id } ') is"
990+ f" of type '{ type (column_id ).__name__ } '"
991+ )
992+ previous_type = type (column_id )
983993
984994 # Initialize Khiops types
985995 self .khiops_types = {}
@@ -988,7 +998,8 @@ def __init__(self, name, dataframe, key=None):
988998 column_numpy_type = column .dtype
989999 column_max_size = None
9901000 if isinstance (column_numpy_type , pd .StringDtype ):
991- column_max_size = column .str .len ().max ()
1001+ # Warning: pandas.Series.str.len() returns a float64, hence the int() conversion
1002+ column_max_size = int (column .str .len ().max ())
9921003 self .khiops_types [column_id ] = get_khiops_type (
9931004 column_numpy_type , column_max_size
9941005 )
@@ -1161,7 +1172,7 @@ def __init__(self, name, matrix, key=None):
11611172 raise TypeError (
11621173 type_error_message ("matrix" , matrix , "scipy.sparse.spmatrix" )
11631174 )
1164- if not np . issubdtype (matrix .dtype , np . number ):
1175+ if not is_numeric_dtype (matrix .dtype ):
11651176 raise TypeError (
11661177 type_error_message ("'matrix' dtype" , matrix .dtype , "numeric" )
11671178 )
0 commit comments