Skip to content

Commit fe8c088

Browse files
committed
Drop deprecated tuple and sequence dataset spec
Only keep list-like input observables that are convertible to NumPy arrays.
1 parent 60dee53 commit fe8c088

2 files changed

Lines changed: 8 additions & 149 deletions

File tree

khiops/sklearn/dataset.py

Lines changed: 6 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,7 @@
2121
import khiops.core as kh
2222
import khiops.core.internals.filesystems as fs
2323
from khiops.core.dictionary import VariableBlock
24-
from khiops.core.exceptions import KhiopsRuntimeError
25-
from khiops.core.internals.common import (
26-
deprecation_message,
27-
is_dict_like,
28-
is_list_like,
29-
type_error_message,
30-
)
24+
from khiops.core.internals.common import is_dict_like, is_list_like, type_error_message
3125

3226
# Disable PEP8 variable names because of scikit-learn X,y conventions
3327
# To capture invalid-names other than X,y run:
@@ -466,7 +460,7 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
466460
self.main_table = PandasTable("main_table", X)
467461
self.secondary_tables = []
468462
# A single numpy array (or compatible object)
469-
elif hasattr(X, "__array__"):
463+
elif hasattr(X, "__array__") or is_list_like(X):
470464
self.main_table = NumpyTable("main_table", X)
471465
self.secondary_tables = []
472466
# A scipy.sparse.spmatrix
@@ -489,57 +483,12 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
489483
),
490484
):
491485
check_array(X, accept_sparse=False)
492-
# A tuple spec
493-
elif isinstance(X, tuple):
494-
warnings.warn(
495-
deprecation_message(
496-
"Tuple dataset input",
497-
"11.0.0",
498-
replacement="dict dataset spec",
499-
quote=False,
500-
),
501-
stacklevel=3,
502-
)
503-
# Check the input tuple
504-
self._check_input_tuple(X)
505-
506-
# Obtain path and separator
507-
path, sep = X
508-
509-
# Initialization
510-
self.main_table = FileTable("main_table", path=path, sep=sep)
511-
self.secondary_tables = []
512-
513-
# A dataset sequence spec
514-
# We try first for compatible python arrays then the deprecated sequences spec
515-
elif is_list_like(X):
516-
# Try to transform to a numerical array with sklearn's check_array
517-
# On failure we try the old deprecated sequence interface
518-
# When the old list interface is eliminated this will considerably reduce
519-
# this branch's code
520-
try:
521-
X_checked = check_array(X, ensure_2d=True, force_all_finite=False)
522-
self.main_table = NumpyTable("main_table", X_checked)
523-
self.secondary_tables = []
524-
except ValueError:
525-
warnings.warn(
526-
deprecation_message(
527-
"List dataset input",
528-
"11.0.0",
529-
replacement="dict dataset spec",
530-
quote=False,
531-
),
532-
stacklevel=3,
533-
)
534-
self._init_tables_from_sequence(X, key=key)
535486
# A dataset dict spec
536487
elif is_dict_like(X):
537488
self._init_tables_from_mapping(X)
538489
# Fail if X is not recognized
539490
else:
540-
raise TypeError(
541-
type_error_message("X", X, "array-like", tuple, Sequence, Mapping)
542-
)
491+
raise TypeError(type_error_message("X", X, "array-like", Mapping, Sequence))
543492

544493
# Initialization of the target column if any
545494
if y is not None:
@@ -581,35 +530,6 @@ def _check_input_tuple(self, X):
581530
if not isinstance(X[1], str):
582531
raise TypeError(type_error_message("X[1]", X[1], str))
583532

584-
def _init_tables_from_sequence(self, X, key=None):
585-
"""Initializes the spec from a list-like 'X'"""
586-
assert is_list_like(X), "'X' must be a list-like"
587-
588-
# Check the input sequence
589-
self._check_input_sequence(X, key=key)
590-
591-
# Initialize the tables
592-
if isinstance(X[0], pd.DataFrame):
593-
self.main_table = PandasTable("main_table", X[0], key=key)
594-
self.secondary_tables = []
595-
for index, dataframe in enumerate(X[1:], start=1):
596-
self.secondary_tables.append(
597-
PandasTable(f"secondary_table_{index:02d}", dataframe, key=key)
598-
)
599-
else:
600-
self.main_table = FileTable("main_table", X[0], key=key)
601-
self.secondary_tables = []
602-
for index, table_path in enumerate(X[1:], start=1):
603-
self.secondary_tables.append(
604-
FileTable(f"secondary_table_{index:02d}", table_path, key=key)
605-
)
606-
607-
# Create a list of relations
608-
main_table_name = self.main_table.name
609-
self.relations = [
610-
(main_table_name, table.name, False) for table in self.secondary_tables
611-
]
612-
613533
def _check_input_sequence(self, X, key=None):
614534
# Check the first table
615535
if len(X) == 0:
@@ -1206,7 +1126,7 @@ class NumpyTable(DatasetTable):
12061126
----------
12071127
name : str
12081128
Name for the table.
1209-
array : `numpy.ndarray` of shape (n_samples, n_features_in)
1129+
array : `numpy.ndarray` of shape (n_samples, n_features_in) or Sequence
12101130
The array to be encapsulated.
12111131
key : :external:term:`array-like` of int, optional
12121132
The names of the columns composing the key.
@@ -1217,8 +1137,8 @@ def __init__(self, name, array, key=None):
12171137
super().__init__(name, key=key)
12181138

12191139
# Check the array's types and shape
1220-
if not hasattr(array, "__array__"):
1221-
raise TypeError(type_error_message("array", array, np.ndarray))
1140+
if not hasattr(array, "__array__") and not is_list_like(array):
1141+
raise TypeError(type_error_message("array", array, np.ndarray, Sequence))
12221142

12231143
# Initialize the members
12241144
self.data_source = check_array(array, ensure_2d=True, force_all_finite=False)

tests/test_dataset_errors.py

Lines changed: 2 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -431,12 +431,12 @@ def assert_dataset_fails(
431431
# Basic X, y tests #
432432
####################
433433

434-
def test_x_must_be_df_or_tuple_or_sequence_or_mapping(self):
434+
def test_x_must_be_df_or_sequence_or_mapping(self):
435435
"""Test that `.Dataset` init raises TypeError when X has a wrong type"""
436436
bad_spec = AnotherType()
437437
y = "class"
438438
expected_msg = type_error_message(
439-
"X", bad_spec, "array-like", tuple, Sequence, Mapping
439+
"X", bad_spec, "array-like", Mapping, Sequence
440440
)
441441
self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
442442

@@ -520,13 +520,6 @@ def test_dict_spec_table_input_type_must_be_a_tuple(self):
520520
)
521521
self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
522522

523-
def test_dict_spec_table_input_tuple_must_have_size_2(self):
524-
"""Test Dataset raising ValueError when a table entry is a tuple of size != 2"""
525-
bad_spec, y = self.create_fixture_dataset_spec()
526-
bad_spec["tables"]["D"] = (*bad_spec["tables"]["D"], "AnotherT", "YetAnotherT")
527-
expected_msg = "'D' table entry must have size 2, not 4"
528-
self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg)
529-
530523
def test_dict_spec_source_table_type_must_be_adequate(self):
531524
"""Test Dataset raising TypeError when a table entry is not str nor DataFrame"""
532525
bad_spec, y = self.create_fixture_dataset_spec()
@@ -926,57 +919,3 @@ def test_file_table_internal_file_creation_fails_on_an_existing_path(self):
926919
expected_msg_prefix = "Cannot overwrite this table's path"
927920
self.assertIn(expected_msg_prefix, output_error_msg)
928921

929-
##########################################################
930-
# Tests for tuple and sequence dataset spec (deprecated) #
931-
##########################################################
932-
933-
def test_tuple_spec_must_have_length_2(self):
934-
"""Test that `.Dataset` raises `ValueError` when the tuple is not of size 2"""
935-
# Test for a tuple of size 3
936-
bad_spec = ("a", "b", "\t")
937-
y = "class"
938-
self.assert_dataset_fails(
939-
bad_spec, y, ValueError, "'X' tuple input must have length 2 not 3"
940-
)
941-
942-
# Test for a tuple of size 1
943-
bad_spec = ("a",)
944-
self.assert_dataset_fails(
945-
bad_spec, y, ValueError, "'X' tuple input must have length 2 not 1"
946-
)
947-
948-
def test_tuple_spec_elements_must_be_str(self):
949-
"""Test Dataset raising TypeError when the tuple spec has non-strings"""
950-
# Test for the first element
951-
bad_spec = (AnotherType(), "/some/path")
952-
y = "class"
953-
expected_msg = type_error_message("X[0]", bad_spec[0], str)
954-
self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
955-
956-
# Test for the second element
957-
bad_spec = ("table-name", AnotherType())
958-
expected_msg = type_error_message("X[1]", bad_spec[1], str)
959-
self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
960-
961-
def test_sequence_spec_must_be_a_non_empty(self):
962-
"""Test that Datasets raises `ValueError` when X is an empty sequence"""
963-
bad_spec = []
964-
y = "class"
965-
expected_msg = "'X' must be a non-empty sequence"
966-
self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg)
967-
968-
def test_sequence_spec_must_be_str_or_df(self):
969-
"""Test Dataset raising TypeError when it is a sequence with bad types"""
970-
# Test that the first element is not str or df
971-
bad_spec = [AnotherType(), "table_1"]
972-
y = "class"
973-
expected_msg = type_error_message("X[0]", bad_spec[0], str, pd.DataFrame)
974-
self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
975-
976-
# Test that the second element is not str
977-
bad_spec = ["table_1", AnotherType()]
978-
expected_msg = (
979-
type_error_message("Table at index 1", bad_spec[1], str)
980-
+ " as the first table in X"
981-
)
982-
self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)

0 commit comments

Comments
 (0)