|
13 | 13 |
|
14 | 14 | import numpy as np |
15 | 15 | import pandas as pd |
16 | | -import sklearn |
17 | 16 | from scipy import sparse as sp |
18 | 17 | from sklearn.utils import check_array |
19 | 18 | from sklearn.utils.validation import column_or_1d |
|
33 | 32 | # pylint --disable=all --enable=invalid-names dataset.py |
34 | 33 | # pylint: disable=invalid-name |
35 | 34 |
|
# Opt in to pandas' future string inference so that the new pandas string
# data type (`StringDtype`) is used even on pandas 2.x, which is still
# required to support Python 3.10.
# `StringDtype` no longer maps to a NumPy data type, so any code that
# assumes NumPy type compatibility will break unless this string data
# type is handled separately.
pd.options.future.infer_string = True
36 | 42 |
|
37 | 43 | def check_dataset_spec(ds_spec): |
38 | 44 | """Checks that a dataset spec is valid |
@@ -393,16 +399,11 @@ def write_internal_data_table(dataframe, file_path_or_stream): |
393 | 399 |
|
394 | 400 |
|
def _column_or_1d_with_dtype(y, dtype=None):
    """Validate `y` as a 1-dimensional column of the given `dtype`.

    When a problem is detected a warning is emitted or an error raised;
    otherwise the pandas object is converted to a 1-D numpy.ndarray.
    """
    return column_or_1d(y, dtype=dtype, warn=True)
406 | 407 |
|
407 | 408 |
|
408 | 409 | class Dataset: |
@@ -607,16 +608,54 @@ def _init_target_column(self, y): |
607 | 608 | # pandas.Series, pandas.DataFrame or numpy.ndarray |
608 | 609 | else: |
609 | 610 | if hasattr(y, "dtype"): |
| 611 | + if not isinstance(y, np.ndarray): |
 | 612 | +            # Since pandas 3.0, arrays of numbers or booleans that |
 | 613 | +            # also contain a carriage-return string are first wrongly |
 | 614 | +            # inferred as `object` dtype instead of `int64` (numbers) |
 | 615 | +            # or `bool` (booleans). |
 | 616 | +            # Forcing pandas to `infer_objects` fixes the error |
| 617 | + if pd.api.types.is_object_dtype(y): |
| 618 | + y = y.infer_objects() |
| 619 | + |
| 620 | + # Since pandas 3.0 (and even in 2.0 if the option is activated) |
| 621 | + # a new `StringDtype` is used to handle strings. |
| 622 | + # It does not match any longer the one recognized by numpy. |
| 623 | + # An issue was created on scikit-learn |
| 624 | + # https://github.com/scikit-learn/scikit-learn/issues/33383 |
| 625 | + # Until it is fixed, 'y' is not checked by |
| 626 | + # `_column_or_1d_with_dtype` when pandas dtype is `StringDtype`. |
| 627 | + |
610 | 628 | if isinstance(y.dtype, pd.CategoricalDtype): |
611 | 629 | y_checked = _column_or_1d_with_dtype( |
612 | | - y, dtype=y.dtype.categories.dtype |
| 630 | + y, |
| 631 | + dtype=( |
| 632 | + y.dtype.categories.dtype |
| 633 | + if not pd.api.types.is_string_dtype( |
| 634 | + y.dtype.categories.dtype |
| 635 | + ) |
| 636 | + else None |
| 637 | + ), |
613 | 638 | ) |
614 | 639 | else: |
615 | | - y_checked = _column_or_1d_with_dtype(y, dtype=y.dtype) |
| 640 | + y_checked = _column_or_1d_with_dtype( |
| 641 | + y, |
| 642 | + dtype=( |
| 643 | + y.dtype |
| 644 | + if not pd.api.types.is_string_dtype(y.dtype) |
| 645 | + else None |
| 646 | + ), |
| 647 | + ) |
616 | 648 | elif hasattr(y, "dtypes"): |
617 | 649 | if isinstance(y.dtypes.iloc[0], pd.CategoricalDtype): |
618 | 650 | y_checked = _column_or_1d_with_dtype( |
619 | | - y, dtype=y.dtypes.iloc[0].categories.dtype |
| 651 | + y, |
| 652 | + dtype=( |
| 653 | + y.dtypes.iloc[0].categories.dtype |
| 654 | + if not pd.api.types.is_string_dtype( |
| 655 | + y.dtypes.iloc[0].categories.dtype |
| 656 | + ) |
| 657 | + else None |
| 658 | + ), |
620 | 659 | ) |
621 | 660 | else: |
622 | 661 | y_checked = _column_or_1d_with_dtype(y) |
@@ -965,21 +1004,16 @@ def __init__(self, name, dataframe, key=None): |
965 | 1004 |
|
966 | 1005 | # Initialize feature columns and verify their types |
967 | 1006 | self.column_ids = self.data_source.columns.values |
968 | | - if not np.issubdtype(self.column_ids.dtype, np.integer): |
969 | | - if np.issubdtype(self.column_ids.dtype, object): |
970 | | - for i, column_id in enumerate(self.column_ids): |
971 | | - if not isinstance(column_id, str): |
972 | | - raise TypeError( |
973 | | - f"Dataframe column ids must be either all integers or " |
974 | | - f"all strings. Column id at index {i} ('{column_id}') is" |
975 | | - f" of type '{type(column_id).__name__}'" |
976 | | - ) |
977 | | - else: |
978 | | - raise TypeError( |
979 | | - f"Dataframe column ids must be either all integers or " |
980 | | - f"all strings. The column index has dtype " |
981 | | - f"'{self.column_ids.dtype}'" |
982 | | - ) |
| 1007 | + # Ensure the feature columns are either all string |
| 1008 | + # or all numeric but not a mix of both. |
| 1009 | + if not pd.api.types.is_numeric_dtype( |
| 1010 | + self.column_ids |
| 1011 | + ) and not pd.api.types.is_string_dtype(self.column_ids): |
| 1012 | + raise TypeError( |
| 1013 | + "Dataframe column ids must be either all integers or " |
| 1014 | + "all strings. Columns have the following mixed types: " |
| 1015 | + f"{sorted(set([type(cid).__name__ for cid in self.column_ids]))}." |
| 1016 | + ) |
983 | 1017 |
|
984 | 1018 | # Initialize Khiops types |
985 | 1019 | self.khiops_types = {} |
|
0 commit comments