1313
1414import numpy as np
1515import pandas as pd
16- import sklearn
1716from scipy import sparse as sp
1817from sklearn .utils import check_array
1918from sklearn .utils .validation import column_or_1d
3332# pylint --disable=all --enable=invalid-names dataset.py
3433# pylint: disable=invalid-name
3534
# Force pandas' new string data type (`StringDtype`) even on pandas 2.x,
# which this project still requires for Python 3.10 support.
# NOTE: `StringDtype` no longer maps to a NumPy data type, so any code
# that assumes NumPy type compatibility will break unless this string
# data type is handled separately (see the string-dtype special cases
# elsewhere in this module).
pd.options.future.infer_string = True
41+
3642
3743def check_dataset_spec (ds_spec ):
3844 """Checks that a dataset spec is valid
@@ -393,16 +399,11 @@ def write_internal_data_table(dataframe, file_path_or_stream):
393399
394400
def _column_or_1d_with_dtype(y, dtype=None):
    """Validate that `y` is a one-dimensional column of the given `dtype`.

    Thin wrapper around scikit-learn's `column_or_1d` (the `dtype`
    argument requires scikit-learn >= 1.2): a warning is emitted or an
    error raised when a problem is detected; otherwise the pandas object
    is returned flattened as a `numpy.array`.
    """
    checked_y = column_or_1d(y, warn=True, dtype=dtype)
    return checked_y
406407
407408
408409class Dataset :
@@ -607,12 +608,37 @@ def _init_target_column(self, y):
607608 # pandas.Series, pandas.DataFrame or numpy.ndarray
608609 else :
609610 if hasattr (y , "dtype" ):
611+ if not isinstance (y , np .ndarray ):
612+ # Since pandas 3.0, numbers and boolean values in an array
613+ # but with a carriage-return are wrongly inferred first
614+ # respectively as `object` dtype instead of `int64` and
615+ # `object` dtype instead of `bool`.
616+ # Forcing pandas to `infer_objects` fixes the error
617+ if pd .api .types .is_object_dtype (y ):
618+ y = y .infer_objects ()
619+ warnings .warn (
620+ "The first guess of 'y' dtype is 'object'. "
621+ "This would lead to errors. "
622+ "After a second pass of inferring, "
623+ f"the detected dtype is { y .dtype } "
624+ )
610625 if isinstance (y .dtype , pd .CategoricalDtype ):
611626 y_checked = _column_or_1d_with_dtype (
612627 y , dtype = y .dtype .categories .dtype
613628 )
614629 else :
615- y_checked = _column_or_1d_with_dtype (y , dtype = y .dtype )
630+ # Since pandas 3.0 (and even in 2.0 if the option is activated)
631+ # a new `StringDtype` is used to handle strings.
632+ # It does not match any longer the one recognized by numpy.
633+ # An issue was created on scikit-learn
634+ # https://github.com/scikit-learn/scikit-learn/issues/33383
635+ # Until it is fixed 'y' is not checked when pandas dtype is
636+ # `StringDtype`.
637+ if pd .api .types .is_string_dtype (y .dtype ):
638+ dtype = None
639+ else :
640+ dtype = y .dtype
641+ y_checked = _column_or_1d_with_dtype (y , dtype = dtype )
616642 elif hasattr (y , "dtypes" ):
617643 if isinstance (y .dtypes .iloc [0 ], pd .CategoricalDtype ):
618644 y_checked = _column_or_1d_with_dtype (
@@ -965,21 +991,19 @@ def __init__(self, name, dataframe, key=None):
965991
966992 # Initialize feature columns and verify their types
967993 self .column_ids = self .data_source .columns .values
968- if not np .issubdtype (self .column_ids .dtype , np .integer ):
969- if np .issubdtype (self .column_ids .dtype , object ):
970- for i , column_id in enumerate (self .column_ids ):
971- if not isinstance (column_id , str ):
972- raise TypeError (
973- f"Dataframe column ids must be either all integers or "
974- f"all strings. Column id at index { i } ('{ column_id } ') is"
975- f" of type '{ type (column_id ).__name__ } '"
976- )
977- else :
978- raise TypeError (
979- f"Dataframe column ids must be either all integers or "
980- f"all strings. The column index has dtype "
981- f"'{ self .column_ids .dtype } '"
982- )
994+ # Ensure the feature columns are either all string
995+ # or all numeric but not a mix of both.
996+ # Warning : the new pandas string data type (`StringDType`)
997+ # - by default in pandas 3.0 or forced in pandas 2.0 -
998+ # cannot be evaluated by `np.issubdtype`, any attempt will raise an error.
999+ if not pd .api .types .is_numeric_dtype (
1000+ self .column_ids
1001+ ) and not pd .api .types .is_string_dtype (self .column_ids ):
1002+ raise TypeError (
1003+ "Dataframe column ids must be either all integers or "
1004+ "all strings. Columns have the following mixed types: "
1005+ f"{ sorted (set ([type (cid ).__name__ for cid in self .column_ids ]))} ."
1006+ )
9831007
9841008 # Initialize Khiops types
9851009 self .khiops_types = {}
@@ -988,7 +1012,8 @@ def __init__(self, name, dataframe, key=None):
9881012 column_numpy_type = column .dtype
9891013 column_max_size = None
9901014 if isinstance (column_numpy_type , pd .StringDtype ):
991- column_max_size = column .str .len ().max ()
1015+ # Warning pandas.Series.str.len() returns a float64
1016+ column_max_size = int (column .str .len ().max ())
9921017 self .khiops_types [column_id ] = get_khiops_type (
9931018 column_numpy_type , column_max_size
9941019 )
0 commit comments