1313
1414import numpy as np
1515import pandas as pd
16- import sklearn
1716from scipy import sparse as sp
1817from sklearn .utils import check_array
1918from sklearn .utils .validation import column_or_1d
3332# pylint --disable=all --enable=invalid-names dataset.py
3433# pylint: disable=invalid-name
3534
35+ # Set a special pandas option to force the new string data type (`StringDtype`)
36+ # even for version 2.0 which is still required for python 3.10.
37+ # This new string data type no longer maps to a NumPy data type.
38+ # Hence, code assuming NumPy type compatibility will break unless
39+ # this string data type is handled separately.
40+ pd .options .future .infer_string = True
41+
3642
3743def check_dataset_spec (ds_spec ):
3844 """Checks that a dataset spec is valid
@@ -393,16 +399,11 @@ def write_internal_data_table(dataframe, file_path_or_stream):
393399
394400
def _column_or_1d_with_dtype(y, dtype=None):
    """Validate that `y` is 1-D data of the provided `dtype`.

    Delegates to scikit-learn's `column_or_1d`: a warning is emitted (or an
    error raised) when a problem is detected; otherwise the pandas object is
    flattened into a `numpy.array`.
    """
    return column_or_1d(y, dtype=dtype, warn=True)
406407
407408
408409class Dataset :
@@ -607,16 +608,54 @@ def _init_target_column(self, y):
607608 # pandas.Series, pandas.DataFrame or numpy.ndarray
608609 else :
609610 if hasattr (y , "dtype" ):
611+ if not isinstance (y , np .ndarray ):
612+ # Since pandas 3.0, numbers and boolean values in an array
613+ # but with a carriage-return are wrongly inferred first
614+ # respectively as `object` dtype instead of `int64` and
615+ # `object` dtype instead of `bool`.
616+ # Forcing pandas to `infer_objects` fixes the error
617+ if pd .api .types .is_object_dtype (y ):
618+ y = y .infer_objects ()
619+
620+ # Since pandas 3.0 (and even in 2.0 if the option is activated)
621+ # a new `StringDtype` is used to handle strings.
622+ # It does not match any longer the one recognized by numpy.
623+ # An issue was created on scikit-learn
624+ # https://github.com/scikit-learn/scikit-learn/issues/33383
625+ # Until it is fixed, 'y' is not checked by
626+ # `_column_or_1d_with_dtype` when pandas dtype is `StringDtype`.
627+
610628 if isinstance (y .dtype , pd .CategoricalDtype ):
611629 y_checked = _column_or_1d_with_dtype (
612- y , dtype = y .dtype .categories .dtype
630+ y ,
631+ dtype = (
632+ y .dtype .categories .dtype
633+ if not pd .api .types .is_string_dtype (
634+ y .dtype .categories .dtype
635+ )
636+ else None
637+ ),
613638 )
614639 else :
615- y_checked = _column_or_1d_with_dtype (y , dtype = y .dtype )
640+ y_checked = _column_or_1d_with_dtype (
641+ y ,
642+ dtype = (
643+ y .dtype
644+ if not pd .api .types .is_string_dtype (y .dtype )
645+ else None
646+ ),
647+ )
616648 elif hasattr (y , "dtypes" ):
617649 if isinstance (y .dtypes .iloc [0 ], pd .CategoricalDtype ):
618650 y_checked = _column_or_1d_with_dtype (
619- y , dtype = y .dtypes .iloc [0 ].categories .dtype
651+ y ,
652+ dtype = (
653+ y .dtypes .iloc [0 ].categories .dtype
654+ if not pd .api .types .is_string_dtype (
655+ y .dtypes .iloc [0 ].categories .dtype
656+ )
657+ else None
658+ ),
620659 )
621660 else :
622661 y_checked = _column_or_1d_with_dtype (y )
@@ -965,21 +1004,16 @@ def __init__(self, name, dataframe, key=None):
9651004
9661005 # Initialize feature columns and verify their types
9671006 self .column_ids = self .data_source .columns .values
968- if not np .issubdtype (self .column_ids .dtype , np .integer ):
969- if np .issubdtype (self .column_ids .dtype , object ):
970- for i , column_id in enumerate (self .column_ids ):
971- if not isinstance (column_id , str ):
972- raise TypeError (
973- f"Dataframe column ids must be either all integers or "
974- f"all strings. Column id at index { i } ('{ column_id } ') is"
975- f" of type '{ type (column_id ).__name__ } '"
976- )
977- else :
978- raise TypeError (
979- f"Dataframe column ids must be either all integers or "
980- f"all strings. The column index has dtype "
981- f"'{ self .column_ids .dtype } '"
982- )
1007+ # Ensure the feature columns are either all string
1008+ # or all numeric but not a mix of both.
1009+ if not pd .api .types .is_numeric_dtype (
1010+ self .column_ids
1011+ ) and not pd .api .types .is_string_dtype (self .column_ids ):
1012+ raise TypeError (
1013+ "Dataframe column ids must be either all integers or "
1014+ "all strings. Columns have the following mixed types: "
1015+ f"{ sorted (set ([type (cid ).__name__ for cid in self .column_ids ]))} ."
1016+ )
9831017
9841018 # Initialize Khiops types
9851019 self .khiops_types = {}
@@ -988,7 +1022,11 @@ def __init__(self, name, dataframe, key=None):
9881022 column_numpy_type = column .dtype
9891023 column_max_size = None
9901024 if isinstance (column_numpy_type , pd .StringDtype ):
991- column_max_size = column .str .len ().max ()
1025+ # If a value is missing in column,
1026+ # column.str.len() would be typed as float64 instead of int64
1027+ # Until this is changed, the type is forced to int64
1028+ # cf https://github.com/pandas-dev/pandas/issues/51948
1029+ column_max_size = column .str .len ().astype (pd .Int64Dtype ()).max ()
9921030 self .khiops_types [column_id ] = get_khiops_type (
9931031 column_numpy_type , column_max_size
9941032 )
0 commit comments