Drop deprecated tuple and sequence dataset spec

popescu-v · popescu-v · commit fe8c0885678a · 2025-03-14T12:47:20.000+01:00
Only keep list-like input observables that are convertible to NumPy arrays.
diff --git a/khiops/sklearn/dataset.py b/khiops/sklearn/dataset.py
@@ -21,13 +21,7 @@
 import khiops.core as kh
 import khiops.core.internals.filesystems as fs
 from khiops.core.dictionary import VariableBlock
-from khiops.core.exceptions import KhiopsRuntimeError
-from khiops.core.internals.common import (
-    deprecation_message,
-    is_dict_like,
-    is_list_like,
-    type_error_message,
-)
+from khiops.core.internals.common import is_dict_like, is_list_like, type_error_message
 
 # Disable PEP8 variable names because of scikit-learn X,y conventions
 # To capture invalid-names other than X,y run:
@@ -466,7 +460,7 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
             self.main_table = PandasTable("main_table", X)
             self.secondary_tables = []
         # A single numpy array (or compatible object)
-        elif hasattr(X, "__array__"):
+        elif hasattr(X, "__array__") or is_list_like(X):
             self.main_table = NumpyTable("main_table", X)
             self.secondary_tables = []
         # A scipy.sparse.spmatrix
@@ -489,57 +483,12 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
             ),
         ):
             check_array(X, accept_sparse=False)
-        # A tuple spec
-        elif isinstance(X, tuple):
-            warnings.warn(
-                deprecation_message(
-                    "Tuple dataset input",
-                    "11.0.0",
-                    replacement="dict dataset spec",
-                    quote=False,
-                ),
-                stacklevel=3,
-            )
-            # Check the input tuple
-            self._check_input_tuple(X)
-
-            # Obtain path and separator
-            path, sep = X
-
-            # Initialization
-            self.main_table = FileTable("main_table", path=path, sep=sep)
-            self.secondary_tables = []
-
-        # A dataset sequence spec
-        # We try first for compatible python arrays then the deprecated sequences spec
-        elif is_list_like(X):
-            # Try to transform to a numerical array with sklearn's check_array
-            # On failure we try the old deprecated sequence interface
-            # When the old list interface is eliminated this will considerably reduce
-            # this branch's code
-            try:
-                X_checked = check_array(X, ensure_2d=True, force_all_finite=False)
-                self.main_table = NumpyTable("main_table", X_checked)
-                self.secondary_tables = []
-            except ValueError:
-                warnings.warn(
-                    deprecation_message(
-                        "List dataset input",
-                        "11.0.0",
-                        replacement="dict dataset spec",
-                        quote=False,
-                    ),
-                    stacklevel=3,
-                )
-                self._init_tables_from_sequence(X, key=key)
         # A a dataset dict spec
         elif is_dict_like(X):
             self._init_tables_from_mapping(X)
         # Fail if X is not recognized
         else:
-            raise TypeError(
-                type_error_message("X", X, "array-like", tuple, Sequence, Mapping)
-            )
+            raise TypeError(type_error_message("X", X, "array-like", Mapping, Sequence))
 
         # Initialization of the target column if any
         if y is not None:
@@ -581,35 +530,6 @@ def _check_input_tuple(self, X):
         if not isinstance(X[1], str):
             raise TypeError(type_error_message("X[1]", X[1], str))
 
-    def _init_tables_from_sequence(self, X, key=None):
-        """Initializes the spec from a list-like 'X'"""
-        assert is_list_like(X), "'X' must be a list-like"
-
-        # Check the input sequence
-        self._check_input_sequence(X, key=key)
-
-        # Initialize the tables
-        if isinstance(X[0], pd.DataFrame):
-            self.main_table = PandasTable("main_table", X[0], key=key)
-            self.secondary_tables = []
-            for index, dataframe in enumerate(X[1:], start=1):
-                self.secondary_tables.append(
-                    PandasTable(f"secondary_table_{index:02d}", dataframe, key=key)
-                )
-        else:
-            self.main_table = FileTable("main_table", X[0], key=key)
-            self.secondary_tables = []
-            for index, table_path in enumerate(X[1:], start=1):
-                self.secondary_tables.append(
-                    FileTable(f"secondary_table_{index:02d}", table_path, key=key)
-                )
-
-        # Create a list of relations
-        main_table_name = self.main_table.name
-        self.relations = [
-            (main_table_name, table.name, False) for table in self.secondary_tables
-        ]
-
     def _check_input_sequence(self, X, key=None):
         # Check the first table
         if len(X) == 0:
@@ -1206,7 +1126,7 @@ class NumpyTable(DatasetTable):
     ----------
     name : str
         Name for the table.
-    array : `numpy.ndarray` of shape (n_samples, n_features_in)
+    array : `numpy.ndarray` of shape (n_samples, n_features_in) or Sequence
         The data frame to be encapsulated.
     key : :external:term`array-like` of int, optional
         The names of the columns composing the key.
@@ -1217,8 +1137,8 @@ def __init__(self, name, array, key=None):
         super().__init__(name, key=key)
 
         # Check the array's types and shape
-        if not hasattr(array, "__array__"):
-            raise TypeError(type_error_message("array", array, np.ndarray))
+        if not hasattr(array, "__array__") and not is_list_like(array):
+            raise TypeError(type_error_message("array", array, np.ndarray, Sequence))
 
         # Initialize the members
         self.data_source = check_array(array, ensure_2d=True, force_all_finite=False)
diff --git a/tests/test_dataset_errors.py b/tests/test_dataset_errors.py
@@ -431,12 +431,12 @@ def assert_dataset_fails(
     # Basic X, y tests #
     ####################
 
-    def test_x_must_be_df_or_tuple_or_sequence_or_mapping(self):
+    def test_x_must_be_df_or_sequence_or_mapping(self):
         """Test that `.Dataset` init raises TypeError when X has a wrong type"""
         bad_spec = AnotherType()
         y = "class"
         expected_msg = type_error_message(
-            "X", bad_spec, "array-like", tuple, Sequence, Mapping
+            "X", bad_spec, "array-like", Mapping, Sequence
         )
         self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
 
@@ -520,13 +520,6 @@ def test_dict_spec_table_input_type_must_be_a_tuple(self):
         )
         self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
 
-    def test_dict_spec_table_input_tuple_must_have_size_2(self):
-        """Test Dataset raising ValueError when a table entry is a tuple of size != 2"""
-        bad_spec, y = self.create_fixture_dataset_spec()
-        bad_spec["tables"]["D"] = (*bad_spec["tables"]["D"], "AnotherT", "YetAnotherT")
-        expected_msg = "'D' table entry must have size 2, not 4"
-        self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg)
-
     def test_dict_spec_source_table_type_must_be_adequate(self):
         """Test Dataset raising TypeError when a table entry is not str nor DataFrame"""
         bad_spec, y = self.create_fixture_dataset_spec()
@@ -926,57 +919,3 @@ def test_file_table_internal_file_creation_fails_on_an_existing_path(self):
         expected_msg_prefix = "Cannot overwrite this table's path"
         self.assertIn(expected_msg_prefix, output_error_msg)
 
-    ##########################################################
-    # Tests for tuple and sequence dataset spec (deprecated) #
-    ##########################################################
-
-    def test_tuple_spec_must_have_length_2(self):
-        """Test that `.Dataset` raises `ValueError` when the tuple is not of size 2"""
-        # Test pour la tuple de taille 3
-        bad_spec = ("a", "b", "\t")
-        y = "class"
-        self.assert_dataset_fails(
-            bad_spec, y, ValueError, "'X' tuple input must have length 2 not 3"
-        )
-
-        # Test pour une tuple de taille 1
-        bad_spec = ("a",)
-        self.assert_dataset_fails(
-            bad_spec, y, ValueError, "'X' tuple input must have length 2 not 1"
-        )
-
-    def test_tuple_spec_elements_must_be_str(self):
-        """Test Dataset raising TypeError when the tuple spec has non-strings"""
-        # Test for the first element
-        bad_spec = (AnotherType(), "/some/path")
-        y = "class"
-        expected_msg = type_error_message("X[0]", bad_spec[0], str)
-        self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
-
-        # Test for the second element
-        bad_spec = ("table-name", AnotherType())
-        expected_msg = type_error_message("X[1]", bad_spec[1], str)
-        self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
-
-    def test_sequence_spec_must_be_a_non_empty(self):
-        """Test that Datasets raises `ValueError` when X is an empty sequence"""
-        bad_spec = []
-        y = "class"
-        expected_msg = "'X' must be a non-empty sequence"
-        self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg)
-
-    def test_sequence_spec_must_be_str_or_df(self):
-        """Test Dataset raising TypeError when it is a sequence with bad types"""
-        # Test that the first element is not str or df
-        bad_spec = [AnotherType(), "table_1"]
-        y = "class"
-        expected_msg = type_error_message("X[0]", bad_spec[0], str, pd.DataFrame)
-        self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
-
-        # Test that the second element is not str
-        bad_spec = ["table_1", AnotherType()]
-        expected_msg = (
-            type_error_message("Table at index 1", bad_spec[1], str)
-            + " as the first table in X"
-        )
-        self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)