Skip to content

Commit 0adabd7

Browse files
committed
Drop deprecated file-based datasets
1 parent fe8c088 commit 0adabd7

8 files changed

Lines changed: 281 additions & 1888 deletions

File tree

khiops/sklearn/dataset.py

Lines changed: 20 additions & 201 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,6 @@
77
"""Classes for handling diverse data tables"""
88
import csv
99
import io
10-
import json
11-
import warnings
1210
from abc import ABC, abstractmethod
1311
from collections.abc import Iterable, Mapping, Sequence
1412

@@ -429,11 +427,11 @@ class Dataset:
429427
430428
Parameters
431429
----------
432-
X : `pandas.DataFrame` or dict (**Deprecated types**: tuple and list)
430+
X : `pandas.DataFrame` or dict
433431
Either:
434432
- A single dataframe
435433
- A ``dict`` dataset specification
436-
y : `pandas.Series` or str, optional
434+
y : `pandas.Series`, `pandas.DataFrame` or `numpy.ndarray`, optional
437435
The target column.
438436
categorical_target : bool, default True
439437
``True`` if the vector ``y`` should be considered as a categorical variable. If
@@ -499,17 +497,6 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
499497
table.name: table for table in [self.main_table] + self.secondary_tables
500498
}
501499

502-
# Deprecation warning for file-based datasets
503-
if isinstance(self.main_table, FileTable):
504-
warnings.warn(
505-
deprecation_message(
506-
"File-based dataset spec",
507-
"11.0.0",
508-
"dataframe-based dataset or khiops.core API",
509-
quote=False,
510-
),
511-
)
512-
513500
# Post-conditions
514501
assert self.main_table is not None, "'main_table' is 'None' after init"
515502
assert isinstance(
@@ -566,39 +553,8 @@ def _init_tables_from_mapping(self, X):
566553
main_table_name = X["main_table"]
567554
main_table_source, main_table_key = X["tables"][main_table_name]
568555

569-
# Initialize a file dataset
570-
if isinstance(main_table_source, str):
571-
# Obtain the file format parameters
572-
if "format" in X:
573-
self.sep, self.header = X["format"]
574-
else:
575-
self.sep = "\t"
576-
self.header = True
577-
578-
# Initialize the tables
579-
self.main_table = FileTable(
580-
main_table_name,
581-
main_table_source,
582-
key=main_table_key,
583-
sep=self.sep,
584-
header=self.header,
585-
)
586-
self.secondary_tables = []
587-
for table_name, (table_source, table_key) in X["tables"].items():
588-
if isinstance(table_key, str):
589-
table_key = [table_key]
590-
if table_name != main_table_name:
591-
self.secondary_tables.append(
592-
FileTable(
593-
table_name,
594-
table_source,
595-
key=table_key,
596-
sep=self.sep,
597-
header=self.header,
598-
)
599-
)
600556
# Initialize a Pandas dataset
601-
elif isinstance(main_table_source, pd.DataFrame):
557+
if isinstance(main_table_source, pd.DataFrame):
602558
self.main_table = PandasTable(
603559
main_table_name,
604560
main_table_source,
@@ -619,7 +575,7 @@ def _init_tables_from_mapping(self, X):
619575
)
620576
self.secondary_tables = []
621577
# Initialize a numpyarray dataset (monotable)
622-
else:
578+
elif hasattr(main_table_source, "__array__"):
623579
self.main_table = NumpyTable(
624580
main_table_name,
625581
main_table_source,
@@ -630,6 +586,12 @@ def _init_tables_from_mapping(self, X):
630586
"with pandas dataframe source tables"
631587
)
632588
self.secondary_tables = []
589+
else:
590+
raise TypeError(
591+
type_error_message(
592+
"X's main table", main_table_source, "array-like", Mapping
593+
)
594+
)
633595

634596
# If the relations are not specified initialize to a star schema
635597
if "relations" not in X:
@@ -657,6 +619,7 @@ def _init_target_column(self, y):
657619
# - warn=True in column_or_1d is necessary to pass sklearn checks
658620
if isinstance(y, str):
659621
y_checked = y
622+
# pandas.Series, pandas.DataFrame or numpy.ndarray
660623
else:
661624
y_checked = column_or_1d(y, warn=True)
662625

@@ -674,13 +637,6 @@ def _init_target_column(self, y):
674637
type_error_message("y", y, "array-like")
675638
+ f" (X's tables are of type {type_message})"
676639
)
677-
if isinstance(self.main_table.data_source, str) and not isinstance(
678-
y_checked, str
679-
):
680-
raise TypeError(
681-
type_error_message("y", y, str)
682-
+ " (X's tables are of type str [file paths])"
683-
)
684640

685641
# Initialize the members related to the target
686642
# Case when y is a memory array
@@ -725,16 +681,6 @@ def _init_target_column(self, y):
725681
else:
726682
self.main_table.khiops_types[self.target_column_id] = "Numerical"
727683

728-
@property
729-
def is_in_memory(self):
730-
"""bool : ``True`` if the dataset is in-memory
731-
732-
A dataset is in-memory if it is constituted either of only pandas.DataFrame
733-
tables, numpy.ndarray, or scipy.sparse.spmatrix tables.
734-
"""
735-
736-
return isinstance(self.main_table, (PandasTable, NumpyTable, SparseTable))
737-
738684
@property
739685
def table_type(self):
740686
"""type : The table type of this dataset's tables
@@ -744,7 +690,6 @@ def table_type(self):
744690
- `PandasTable`
745691
- `NumpyTable`
746692
- `SparseTable`
747-
- `FileTable`
748693
"""
749694
return type(self.main_table)
750695

@@ -767,8 +712,6 @@ def to_spec(self):
767712
if self.relations:
768713
ds_spec["relations"] = []
769714
ds_spec["relations"].extend(self.relations)
770-
if self.table_type == FileTable:
771-
ds_spec["format"] = (self.sep, self.header)
772715

773716
return ds_spec
774717

@@ -815,8 +758,8 @@ def create_khiops_dictionary_domain(self):
815758
main_dictionary = self.main_table.create_khiops_dictionary()
816759
dictionary_domain.add_dictionary(main_dictionary)
817760

818-
# For in-memory datasets: Add the target variable if available
819-
if self.is_in_memory and self.target_column is not None:
761+
# Add the target variable if available
762+
if self.target_column is not None:
820763
variable = kh.Variable()
821764
variable.name = get_khiops_variable_name(self.target_column_id)
822765
if self.categorical_target:
@@ -884,20 +827,13 @@ def create_table_files_for_khiops(self, output_dir, sort=True):
884827
self.is_multitable or self.main_table.key is not None
885828
)
886829

887-
# In-memory dataset: Create the table files and add the target column
888-
if self.is_in_memory:
889-
main_table_path = self.main_table.create_table_file_for_khiops(
890-
output_dir,
891-
sort=sort_main_table,
892-
target_column=self.target_column,
893-
target_column_id=self.target_column_id,
894-
)
895-
# File dataset: Create the table files (the target column is in the file)
896-
else:
897-
main_table_path = self.main_table.create_table_file_for_khiops(
898-
output_dir,
899-
sort=sort_main_table,
900-
)
830+
# Create the table files and add the target column
831+
main_table_path = self.main_table.create_table_file_for_khiops(
832+
output_dir,
833+
sort=sort_main_table,
834+
target_column=self.target_column,
835+
target_column_id=self.target_column_id,
836+
)
901837

902838
# Create a copy of each secondary table
903839
secondary_table_paths = {}
@@ -1346,120 +1282,3 @@ def create_table_file_for_khiops(
13461282
)
13471283

13481284
return output_table_path
1349-
1350-
1351-
class FileTable(DatasetTable):
1352-
"""DatasetTable encapsulating a delimited text data file
1353-
1354-
Parameters
1355-
----------
1356-
name : str
1357-
Name for the table.
1358-
path : str
1359-
Path of the file containing the table.
1360-
key : list-like of str, optional
1361-
The names of the columns composing the key.
1362-
sep : str, optional
1363-
Field separator character. If not specified it will be inferred from the file.
1364-
header : bool, optional
1365-
Indicates if the table file has a header line.
1366-
"""
1367-
1368-
def __init__(
1369-
self,
1370-
name,
1371-
path,
1372-
key=None,
1373-
sep="\t",
1374-
header=True,
1375-
):
1376-
# Initialize parameters
1377-
super().__init__(name=name, key=key)
1378-
1379-
# Check the parameters specific to this sub-class
1380-
if not isinstance(path, str):
1381-
raise TypeError(type_error_message("path", path, str))
1382-
if not fs.exists(path):
1383-
raise ValueError(f"Non-existent data table file: {path}")
1384-
1385-
# Initialize members specific to this sub-class
1386-
self.data_source = path
1387-
self.sep = sep
1388-
self.header = header
1389-
1390-
# Build a dictionary file from the input data table
1391-
# Note: We use export_dictionary_as_json instead of read_dictionary_file
1392-
# because the latter makes the sklearn mocked tests fail (this is technical debt)
1393-
try:
1394-
tmp_kdic_path = kh.get_runner().create_temp_file("file_table_", ".kdic")
1395-
tmp_kdicj_path = kh.get_runner().create_temp_file("file_table_", ".kdicj")
1396-
kh.build_dictionary_from_data_table(
1397-
self.data_source,
1398-
self.name,
1399-
tmp_kdic_path,
1400-
field_separator=self.sep,
1401-
header_line=header,
1402-
)
1403-
kh.export_dictionary_as_json(tmp_kdic_path, tmp_kdicj_path)
1404-
json_domain = json.loads(fs.read(tmp_kdicj_path))
1405-
finally:
1406-
fs.remove(tmp_kdic_path)
1407-
fs.remove(tmp_kdicj_path)
1408-
1409-
# Alert the user if the parsing failed
1410-
if len(json_domain["dictionaries"]) == 0:
1411-
raise KhiopsRuntimeError(
1412-
f"Failed to build a dictionary "
1413-
f"from data table file: {self.data_source}"
1414-
)
1415-
1416-
# Set the column names and types
1417-
variables = json_domain["dictionaries"][0]["variables"]
1418-
self.column_ids = [var["name"] for var in variables]
1419-
self.khiops_types = {var["name"]: var["type"] for var in variables}
1420-
1421-
# Check key integrity
1422-
self.check_key()
1423-
1424-
def create_table_file_for_khiops(self, output_dir, sort=True):
1425-
assert not sort or self.key is not None, "key is 'None'"
1426-
1427-
# Create the input and output file resources
1428-
if sort:
1429-
output_table_file_path = fs.get_child_path(
1430-
output_dir, f"sorted_{self.name}.txt"
1431-
)
1432-
else:
1433-
output_table_file_path = fs.get_child_path(
1434-
output_dir, f"copy_{self.name}.txt"
1435-
)
1436-
1437-
# Fail if they have the same path
1438-
if output_table_file_path == self.data_source:
1439-
raise ValueError(f"Cannot overwrite this table's path: {self.data_source}")
1440-
1441-
# Create a sorted copy if requested
1442-
if sort:
1443-
# Create the sorting dictionary domain
1444-
sort_dictionary_domain = kh.DictionaryDomain()
1445-
sort_dictionary_domain.add_dictionary(self.create_khiops_dictionary())
1446-
1447-
# Delegate the sorting and copy to khiops.core.sort_data_table
1448-
# We use the same input format of the original table
1449-
kh.sort_data_table(
1450-
sort_dictionary_domain,
1451-
self.name,
1452-
self.data_source,
1453-
output_table_file_path,
1454-
self.key,
1455-
field_separator=self.sep,
1456-
header_line=self.header,
1457-
output_field_separator=self.sep,
1458-
output_header_line=self.header,
1459-
)
1460-
1461-
# Otherwise copy the contents to the output file
1462-
else:
1463-
fs.write(output_table_file_path, fs.read(self.data_source))
1464-
1465-
return output_table_file_path

0 commit comments

Comments
 (0)