77"""Classes for handling diverse data tables"""
88import csv
99import io
10- import json
11- import warnings
1210from abc import ABC , abstractmethod
1311from collections .abc import Iterable , Mapping , Sequence
1412
@@ -429,11 +427,11 @@ class Dataset:
429427
430428 Parameters
431429 ----------
432- X : `pandas.DataFrame` or dict (**Deprecated types**: tuple and list)
430+ X : `pandas.DataFrame` or dict
433431 Either:
434432 - A single dataframe
435433 - A ``dict`` dataset specification
436- y : `pandas.Series` or str , optional
434+ y : `pandas.Series`, `pandas.DataFrame` or `numpy.ndarray` , optional
437435 The target column.
438436 categorical_target : bool, default True
439437 ``True`` if the vector ``y`` should be considered as a categorical variable. If
@@ -499,17 +497,6 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
499497 table .name : table for table in [self .main_table ] + self .secondary_tables
500498 }
501499
502- # Deprecation warning for file-based datasets
503- if isinstance (self .main_table , FileTable ):
504- warnings .warn (
505- deprecation_message (
506- "File-based dataset spec" ,
507- "11.0.0" ,
508- "dataframe-based dataset or khiops.core API" ,
509- quote = False ,
510- ),
511- )
512-
513500 # Post-conditions
514501 assert self .main_table is not None , "'main_table' is 'None' after init"
515502 assert isinstance (
@@ -566,39 +553,8 @@ def _init_tables_from_mapping(self, X):
566553 main_table_name = X ["main_table" ]
567554 main_table_source , main_table_key = X ["tables" ][main_table_name ]
568555
569- # Initialize a file dataset
570- if isinstance (main_table_source , str ):
571- # Obtain the file format parameters
572- if "format" in X :
573- self .sep , self .header = X ["format" ]
574- else :
575- self .sep = "\t "
576- self .header = True
577-
578- # Initialize the tables
579- self .main_table = FileTable (
580- main_table_name ,
581- main_table_source ,
582- key = main_table_key ,
583- sep = self .sep ,
584- header = self .header ,
585- )
586- self .secondary_tables = []
587- for table_name , (table_source , table_key ) in X ["tables" ].items ():
588- if isinstance (table_key , str ):
589- table_key = [table_key ]
590- if table_name != main_table_name :
591- self .secondary_tables .append (
592- FileTable (
593- table_name ,
594- table_source ,
595- key = table_key ,
596- sep = self .sep ,
597- header = self .header ,
598- )
599- )
600556 # Initialize a Pandas dataset
601- elif isinstance (main_table_source , pd .DataFrame ):
557+ if isinstance (main_table_source , pd .DataFrame ):
602558 self .main_table = PandasTable (
603559 main_table_name ,
604560 main_table_source ,
@@ -619,7 +575,7 @@ def _init_tables_from_mapping(self, X):
619575 )
620576 self .secondary_tables = []
621577 # Initialize a numpy array dataset (monotable)
622- else :
578+ elif hasattr ( main_table_source , "__array__" ) :
623579 self .main_table = NumpyTable (
624580 main_table_name ,
625581 main_table_source ,
@@ -630,6 +586,12 @@ def _init_tables_from_mapping(self, X):
630586 "with pandas dataframe source tables"
631587 )
632588 self .secondary_tables = []
589+ else :
590+ raise TypeError (
591+ type_error_message (
592+ "X's main table" , main_table_source , "array-like" , Mapping
593+ )
594+ )
633595
634596 # If the relations are not specified initialize to a star schema
635597 if "relations" not in X :
@@ -657,6 +619,7 @@ def _init_target_column(self, y):
657619 # - warn=True in column_or_1d is necessary to pass sklearn checks
658620 if isinstance (y , str ):
659621 y_checked = y
622+ # pandas.Series, pandas.DataFrame or numpy.ndarray
660623 else :
661624 y_checked = column_or_1d (y , warn = True )
662625
@@ -674,13 +637,6 @@ def _init_target_column(self, y):
674637 type_error_message ("y" , y , "array-like" )
675638 + f" (X's tables are of type { type_message } )"
676639 )
677- if isinstance (self .main_table .data_source , str ) and not isinstance (
678- y_checked , str
679- ):
680- raise TypeError (
681- type_error_message ("y" , y , str )
682- + " (X's tables are of type str [file paths])"
683- )
684640
685641 # Initialize the members related to the target
686642 # Case when y is a memory array
@@ -725,16 +681,6 @@ def _init_target_column(self, y):
725681 else :
726682 self .main_table .khiops_types [self .target_column_id ] = "Numerical"
727683
728- @property
729- def is_in_memory (self ):
730- """bool : ``True`` if the dataset is in-memory
731-
732- A dataset is in-memory if it is constituted either of only pandas.DataFrame
733- tables, numpy.ndarray, or scipy.sparse.spmatrix tables.
734- """
735-
736- return isinstance (self .main_table , (PandasTable , NumpyTable , SparseTable ))
737-
738684 @property
739685 def table_type (self ):
740686 """type : The table type of this dataset's tables
@@ -744,7 +690,6 @@ def table_type(self):
744690 - `PandasTable`
745691 - `NumpyTable`
746692 - `SparseTable`
747- - `FileTable`
748693 """
749694 return type (self .main_table )
750695
@@ -767,8 +712,6 @@ def to_spec(self):
767712 if self .relations :
768713 ds_spec ["relations" ] = []
769714 ds_spec ["relations" ].extend (self .relations )
770- if self .table_type == FileTable :
771- ds_spec ["format" ] = (self .sep , self .header )
772715
773716 return ds_spec
774717
@@ -815,8 +758,8 @@ def create_khiops_dictionary_domain(self):
815758 main_dictionary = self .main_table .create_khiops_dictionary ()
816759 dictionary_domain .add_dictionary (main_dictionary )
817760
818- # For in-memory datasets: Add the target variable if available
819- if self .is_in_memory and self . target_column is not None :
761+ # Add the target variable if available
762+ if self .target_column is not None :
820763 variable = kh .Variable ()
821764 variable .name = get_khiops_variable_name (self .target_column_id )
822765 if self .categorical_target :
@@ -884,20 +827,13 @@ def create_table_files_for_khiops(self, output_dir, sort=True):
884827 self .is_multitable or self .main_table .key is not None
885828 )
886829
887- # In-memory dataset: Create the table files and add the target column
888- if self .is_in_memory :
889- main_table_path = self .main_table .create_table_file_for_khiops (
890- output_dir ,
891- sort = sort_main_table ,
892- target_column = self .target_column ,
893- target_column_id = self .target_column_id ,
894- )
895- # File dataset: Create the table files (the target column is in the file)
896- else :
897- main_table_path = self .main_table .create_table_file_for_khiops (
898- output_dir ,
899- sort = sort_main_table ,
900- )
830+ # Create the table files and add the target column
831+ main_table_path = self .main_table .create_table_file_for_khiops (
832+ output_dir ,
833+ sort = sort_main_table ,
834+ target_column = self .target_column ,
835+ target_column_id = self .target_column_id ,
836+ )
901837
902838 # Create a copy of each secondary table
903839 secondary_table_paths = {}
@@ -1346,120 +1282,3 @@ def create_table_file_for_khiops(
13461282 )
13471283
13481284 return output_table_path
1349-
1350-
1351- class FileTable (DatasetTable ):
1352- """DatasetTable encapsulating a delimited text data file
1353-
1354- Parameters
1355- ----------
1356- name : str
1357- Name for the table.
1358- path : str
1359- Path of the file containing the table.
1360- key : list-like of str, optional
1361- The names of the columns composing the key.
1362- sep : str, optional
1363- Field separator character. If not specified it will be inferred from the file.
1364- header : bool, optional
1365- Indicates if the table.
1366- """
1367-
1368- def __init__ (
1369- self ,
1370- name ,
1371- path ,
1372- key = None ,
1373- sep = "\t " ,
1374- header = True ,
1375- ):
1376- # Initialize parameters
1377- super ().__init__ (name = name , key = key )
1378-
1379- # Check the parameters specific to this sub-class
1380- if not isinstance (path , str ):
1381- raise TypeError (type_error_message ("path" , path , str ))
1382- if not fs .exists (path ):
1383- raise ValueError (f"Non-existent data table file: { path } " )
1384-
1385- # Initialize members specific to this sub-class
1386- self .data_source = path
1387- self .sep = sep
1388- self .header = header
1389-
1390- # Build a dictionary file from the input data table
1391- # Note: We use export_dictionary_as_json instead of read_dictionary_file
1392- # because it makes fail the sklearn mocked tests (this is technical debt)
1393- try :
1394- tmp_kdic_path = kh .get_runner ().create_temp_file ("file_table_" , ".kdic" )
1395- tmp_kdicj_path = kh .get_runner ().create_temp_file ("file_table_" , ".kdicj" )
1396- kh .build_dictionary_from_data_table (
1397- self .data_source ,
1398- self .name ,
1399- tmp_kdic_path ,
1400- field_separator = self .sep ,
1401- header_line = header ,
1402- )
1403- kh .export_dictionary_as_json (tmp_kdic_path , tmp_kdicj_path )
1404- json_domain = json .loads (fs .read (tmp_kdicj_path ))
1405- finally :
1406- fs .remove (tmp_kdic_path )
1407- fs .remove (tmp_kdicj_path )
1408-
1409- # Alert the user if the parsing failed
1410- if len (json_domain ["dictionaries" ]) == 0 :
1411- raise KhiopsRuntimeError (
1412- f"Failed to build a dictionary "
1413- f"from data table file: { self .data_source } "
1414- )
1415-
1416- # Set the column names and types
1417- variables = json_domain ["dictionaries" ][0 ]["variables" ]
1418- self .column_ids = [var ["name" ] for var in variables ]
1419- self .khiops_types = {var ["name" ]: var ["type" ] for var in variables }
1420-
1421- # Check key integrity
1422- self .check_key ()
1423-
1424- def create_table_file_for_khiops (self , output_dir , sort = True ):
1425- assert not sort or self .key is not None , "key is 'None'"
1426-
1427- # Create the input and output file resources
1428- if sort :
1429- output_table_file_path = fs .get_child_path (
1430- output_dir , f"sorted_{ self .name } .txt"
1431- )
1432- else :
1433- output_table_file_path = fs .get_child_path (
1434- output_dir , f"copy_{ self .name } .txt"
1435- )
1436-
1437- # Fail if they have the same path
1438- if output_table_file_path == self .data_source :
1439- raise ValueError (f"Cannot overwrite this table's path: { self .data_source } " )
1440-
1441- # Create a sorted copy if requested
1442- if sort :
1443- # Create the sorting dictionary domain
1444- sort_dictionary_domain = kh .DictionaryDomain ()
1445- sort_dictionary_domain .add_dictionary (self .create_khiops_dictionary ())
1446-
1447- # Delegate the sorting and copy to khiops.core.sort_data_table
1448- # We use the same input format of the original table
1449- kh .sort_data_table (
1450- sort_dictionary_domain ,
1451- self .name ,
1452- self .data_source ,
1453- output_table_file_path ,
1454- self .key ,
1455- field_separator = self .sep ,
1456- header_line = self .header ,
1457- output_field_separator = self .sep ,
1458- output_header_line = self .header ,
1459- )
1460-
1461- # Otherwise copy the contents to the output file
1462- else :
1463- fs .write (output_table_file_path , fs .read (self .data_source ))
1464-
1465- return output_table_file_path
0 commit comments