1313
1414import numpy as np
1515import pandas as pd
16- import sklearn
1716from scipy import sparse as sp
1817from sklearn .utils import check_array
1918from sklearn .utils .validation import column_or_1d
3332# pylint --disable=all --enable=invalid-names dataset.py
3433# pylint: disable=invalid-name
3534
35+ # Set a special pandas option to force the new string data type (`StringDType`)
36+ # even for version 2.0 which is still required for python 3.10.
37+ # This new string data type does not map any longer to the corresponding numpy one
38+ # and will break the code unless a special care is taken
39+ pd .options .future .infer_string = True
40+
3641
3742def check_dataset_spec (ds_spec ):
3843 """Checks that a dataset spec is valid
@@ -393,16 +398,19 @@ def write_internal_data_table(dataframe, file_path_or_stream):
393398
394399
def _column_or_1d_with_dtype(y, dtype=None):
    """Coerces `y` to a 1-d numpy array of the requested `dtype`.

    Validation problems are reported by scikit-learn's `column_or_1d`
    (a warning is printed or an error raised); otherwise the pandas
    object is transformed into a `numpy.array`.
    """
    # Pandas >= 3.0 (and 2.x when `future.infer_string` is activated) stores
    # strings with a dedicated StringDType that numpy no longer recognizes.
    # Whenever a pandas string dtype is detected (`is_string_dtype` returns
    # `True`), force the translation to the numpy string dtype before
    # delegating to scikit-learn.
    target_dtype = np.dtype(str) if pd.api.types.is_string_dtype(dtype) else dtype
    return column_or_1d(y, warn=True, dtype=target_dtype)
406414
407415
408416class Dataset :
@@ -607,6 +615,21 @@ def _init_target_column(self, y):
607615 # pandas.Series, pandas.DataFrame or numpy.ndarray
608616 else :
609617 if hasattr (y , "dtype" ):
if not isinstance(y, np.ndarray):
    # Since pandas 3.0, numbers and boolean values in an array that also
    # contains e.g. a carriage return are wrongly inferred first as the
    # `object` dtype (instead of `int64` / `bool`). Forcing pandas to
    # `infer_objects` fixes the detection.
    if pd.api.types.is_object_dtype(y):
        y = y.infer_objects()
        # BUG FIX: the original call passed `y.dtype` as the second
        # positional argument of `warnings.warn`, i.e. as `category`
        # (which must be a Warning subclass), raising a TypeError; the
        # `{}` placeholder was also never formatted. Use an f-string.
        warnings.warn(
            "The first guess of 'y' dtype is 'object'. "
            "This would lead to errors. "
            "After a second pass of inferring, "
            f"the detected dtype is {y.dtype}"
        )
610633 if isinstance (y .dtype , pd .CategoricalDtype ):
611634 y_checked = _column_or_1d_with_dtype (
612635 y , dtype = y .dtype .categories .dtype
@@ -965,21 +988,23 @@ def __init__(self, name, dataframe, key=None):
965988
# Initialize feature columns and verify their types
self.column_ids = self.data_source.columns.values

# The feature column index must be homogeneous: either all-numeric or
# all-string, never a mix of both.
# Warning: the new pandas string data type (`StringDType`) — the default
# in pandas 3.0 and opt-in on 2.x — cannot be evaluated with
# `np.issubdtype` (any attempt raises), hence the pandas dtype predicates.
ids_are_numeric = pd.api.types.is_numeric_dtype(self.column_ids)
ids_are_string = pd.api.types.is_string_dtype(self.column_ids)
if not (ids_are_numeric or ids_are_string):
    # Mixed `object` index: every column id must share the exact same type
    expected_type = None
    for index, current_id in enumerate(self.column_ids):
        if expected_type is not None and type(current_id) is not expected_type:
            raise TypeError(
                f"Dataframe column ids must be either all integers or "
                f"all strings. Column id at index {index} ('{current_id}') is"
                f" of type '{type(current_id).__name__}'"
            )
        expected_type = type(current_id)
9831008
9841009 # Initialize Khiops types
9851010 self .khiops_types = {}
@@ -988,7 +1013,8 @@ def __init__(self, name, dataframe, key=None):
column_numpy_type = column.dtype
column_max_size = None
if isinstance(column_numpy_type, pd.StringDtype):
    # Warning: `pandas.Series.str.len()` returns float64 (NaN for missing
    # values), so the maximum must be cast back to an int.
    max_length = column.str.len().max()
    # ROBUSTNESS FIX: on an empty or all-missing column `max()` is NaN and
    # `int(NaN)` raises ValueError; fall back to 0 in that case.
    column_max_size = 0 if pd.isna(max_length) else int(max_length)
self.khiops_types[column_id] = get_khiops_type(
    column_numpy_type, column_max_size
)
@@ -1161,7 +1187,7 @@ def __init__(self, name, matrix, key=None):
11611187 raise TypeError (
11621188 type_error_message ("matrix" , matrix , "scipy.sparse.spmatrix" )
11631189 )
# Use the pandas dtype predicate (rather than `np.issubdtype`) so the
# check also behaves with the newer pandas dtype machinery.
matrix_dtype_is_numeric = pd.api.types.is_numeric_dtype(matrix.dtype)
if not matrix_dtype_is_numeric:
    raise TypeError(
        type_error_message("'matrix' dtype", matrix.dtype, "numeric")
    )
0 commit comments