diff --git a/CHANGELOG.md b/CHANGELOG.md index 9064cefd..9dd925d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ ### Added - (`core`) Dictionary API support for dictionary, variable and variable block comments, and dictionary and variable block internal comments. +- (`sklearn`) `Text` Khiops type support at the estimator level. ### Fixed - (General) Inconsistency between the `tools.download_datasets` function and the diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index b2ace60b..ce8611d9 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -148,6 +148,72 @@ Samples test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr") print(f"Test accuracy = {test_accuracy}") print(f"Test auc = {test_auc}") +.. autofunction:: khiops_classifier_text +.. code-block:: python + + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.model_selection import train_test_split + + # Load the dataset into a pandas dataframe + data_table_path = os.path.join( + kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt" + ) + data_df = pd.read_csv(data_table_path, sep="\t") + + # Split the whole dataframe into train and test (70%-30%) + data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1) + + # Split the dataset into: + # - the X feature table + # - the y target vector ("negativereason" column) + X_train = data_train_df.drop("negativereason", axis=1) + X_test = data_test_df.drop("negativereason", axis=1) + y_train = data_train_df["negativereason"] + y_test = data_test_df["negativereason"] + + # Set Pandas StringDType on the "text" column + X_train["text"] = X_train["text"].astype("string") + X_test["text"] = X_test["text"].astype("string") + + # Create the classifier object + khc = KhiopsClassifier() + + # Train the classifier + khc.fit(X_train, y_train) + + # Show the feature importance info + print(f"Features evaluated: {khc.n_features_evaluated_}") + print(f"Features selected : {khc.n_features_used_}") + print("Top 3 used features") + for i, feature in enumerate(khc.feature_used_names_[:3]): + print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print("---") + + # Predict the classes on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[0:10]) + print("---") + + # Predict the class probabilities on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[0:10]) + print("---") + + # Evaluate the accuracy metric on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + print(f"Test accuracy = {test_accuracy}") + + # If you have Khiops Visualization installed you may open the report as follows + # khc.export_report_file("report.khj") + # kh.visualize_report("report.khj") .. autofunction:: khiops_classifier_multitable_star .. code-block:: python diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index c4b107fe..6fa1fce2 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -149,6 +149,85 @@ "print(f\"Test auc = {test_auc}\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_text()`\n\n", + "Train a `.KhiopsClassifier` on a monotable dataframe with textual data\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from sklearn import metrics\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Load the dataset into a pandas dataframe\n", + "data_table_path = os.path.join(\n", + " kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n", + ")\n", + "data_df = pd.read_csv(data_table_path, sep=\"\\t\")\n", + "\n", + "# Split the whole dataframe into train and test (70%-30%)\n", + "data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)\n", + "\n", + "# Split the dataset into:\n", + "# - the X feature table\n", + "# - the y target vector (\"negativereason\" column)\n", + "X_train = data_train_df.drop(\"negativereason\", axis=1)\n", + "X_test = data_test_df.drop(\"negativereason\", axis=1)\n", + "y_train = data_train_df[\"negativereason\"]\n", + "y_test = data_test_df[\"negativereason\"]\n", + "\n", + "# Set Pandas StringDType on the \"text\" column\n", + "X_train[\"text\"] = X_train[\"text\"].astype(\"string\")\n", + "X_test[\"text\"] = X_test[\"text\"].astype(\"string\")\n", + "\n", + "# Create the classifier object\n", + "khc = KhiopsClassifier()\n", + "\n", + "# Train the classifier\n", + "khc.fit(X_train, y_train)\n", + "\n", + "# Show the feature importance info\n", + "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n", + "print(f\"Features selected : {khc.n_features_used_}\")\n", + "print(\"Top 3 used features\")\n", + "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", + " print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n", + "print(\"---\")\n", + "\n", + "# Predict the classes on the test dataset\n", + "y_test_pred = khc.predict(X_test)\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[0:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probabilities on the test dataset\n", + "y_test_probas = khc.predict_proba(X_test)\n", + "print(f\"Class order: {khc.classes_}\")\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[0:10])\n", + "print(\"---\")\n", + "\n", + "# Evaluate the accuracy metric on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "\n", + "# If you have Khiops Visualization installed you may open the report as follows\n", + "# khc.export_report_file(\"report.khj\")\n", + "# kh.visualize_report(\"report.khj\")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index d473be53..3ba1d320 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -142,6 +142,75 @@ def khiops_classifier_multiclass(): print(f"Test auc = {test_auc}") +def khiops_classifier_text(): + """Train a `.KhiopsClassifier` on a monotable dataframe with textual data""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.model_selection import train_test_split + + # Load the dataset into a pandas dataframe + data_table_path = os.path.join( + kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt" + ) + data_df = pd.read_csv(data_table_path, sep="\t") + + # Split the whole dataframe into train and test (70%-30%) + data_train_df, data_test_df = train_test_split( + data_df, test_size=0.3, random_state=1 + ) + + # Split the dataset into: + # - the X feature table + # - the y target vector ("negativereason" column) + X_train = data_train_df.drop("negativereason", axis=1) + X_test = data_test_df.drop("negativereason", axis=1) + y_train = data_train_df["negativereason"] + y_test = data_test_df["negativereason"] + + # Set Pandas StringDType on the "text" column + X_train["text"] = X_train["text"].astype("string") + X_test["text"] = X_test["text"].astype("string") + + # Create the classifier object + khc = KhiopsClassifier() + + # Train the classifier + khc.fit(X_train, y_train) + + # Show the feature importance info + print(f"Features evaluated: {khc.n_features_evaluated_}") + print(f"Features selected : {khc.n_features_used_}") + print("Top 3 used features") + for i, feature in enumerate(khc.feature_used_names_[:3]): + print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print("---") + + # Predict the classes on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[0:10]) + print("---") + + # Predict the class probabilities on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[0:10]) + print("---") + + # Evaluate the accuracy metric on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + print(f"Test accuracy = {test_accuracy}") + + # If you have Khiops Visualization installed you may open the report as follows + # khc.export_report_file("report.khj") + # kh.visualize_report("report.khj") + + def khiops_classifier_multitable_star(): """Trains a `.KhiopsClassifier` on a star multi-table dataset""" # Imports @@ -831,6 +900,7 @@ def khiops_coclustering_simplify(): exported_samples = [ khiops_classifier, khiops_classifier_multiclass, + khiops_classifier_text, khiops_classifier_multitable_star, khiops_classifier_multitable_snowflake, khiops_classifier_sparse, diff --git a/khiops/sklearn/dataset.py b/khiops/sklearn/dataset.py index 08ccb8f1..03603917 100644 --- a/khiops/sklearn/dataset.py +++ b/khiops/sklearn/dataset.py @@ -236,19 +236,40 @@ def _upgrade_mapping_spec(ds_spec): return new_ds_spec -def get_khiops_type(numpy_type): +def get_khiops_type(numpy_type, categorical_str_max_size=None): """Translates a numpy dtype to a Khiops dictionary type Parameters ---------- - numpy_type : `numpy.dtype`: + numpy_type : `numpy.dtype` Numpy type of the column + categorical_str_max_size : `int`, optional + Maximum length of the entries of the column whose type is ``numpy_type``. Returns ------- str - Khiops type name. Either "Categorical", "Numerical" or "Timestamp" + Khiops type name. Either "Categorical", "Text", "Numerical" or "Timestamp". + + .. note:: + The "Text" Khiops type is inferred if the Numpy type is "string" + and the maximum length of the entries of that type is greater than 100. + """ + # Check categorical_str_max_size type + if categorical_str_max_size is not None and not isinstance( + categorical_str_max_size, (int, np.int64) + ): + raise TypeError( + type_error_message( + "categorical_str_max_size", + categorical_str_max_size, + int, + np.int64, + ) + ) + + # Get the Numpy dtype in lowercase lower_numpy_type = str(numpy_type).lower() # timedelta64 and datetime64 types @@ -257,6 +278,11 @@ def get_khiops_type(numpy_type): # float, int, uint types elif "int" in lower_numpy_type or "float" in lower_numpy_type: khiops_type = "Numerical" + elif lower_numpy_type == "string": + if categorical_str_max_size is not None and categorical_str_max_size > 100: + khiops_type = "Text" + else: + khiops_type = "Categorical" # bool_ and object, character, bytes_, str_, void, record and other types else: khiops_type = "Categorical" @@ -956,10 +982,16 @@ def __init__(self, name, dataframe, key=None): ) # Initialize Khiops types - self.khiops_types = { - column_id: get_khiops_type(self.data_source.dtypes[column_id]) - for column_id in self.column_ids - } + self.khiops_types = {} + for column_id in self.column_ids: + column = self.data_source[column_id] + column_numpy_type = column.dtype + column_max_size = None + if isinstance(column_numpy_type, pd.StringDtype): + column_max_size = column.str.len().max() + self.khiops_types[column_id] = get_khiops_type( + column_numpy_type, column_max_size + ) # Check key integrity self.check_key() diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index bf0435c9..3cfcf13a 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -1199,6 +1199,8 @@ def __init__( self, n_features=100, n_trees=10, + n_text_features=10000, + type_text_features="words", specific_pairs=None, all_possible_pairs=True, construction_rules=None, @@ -1213,6 +1215,8 @@ def __init__( ) self.n_features = n_features self.n_trees = n_trees + self.n_text_features = n_text_features + self.type_text_features = type_text_features self.specific_pairs = specific_pairs self.all_possible_pairs = all_possible_pairs self.construction_rules = construction_rules @@ -1280,6 +1284,20 @@ def _fit_check_params(self, ds, **kwargs): raise TypeError(type_error_message("n_trees", self.n_trees, int)) if self.n_trees < 0: raise ValueError("'n_trees' must be positive") + if not isinstance(self.n_text_features, int): + raise TypeError( + type_error_message("n_text_features", self.n_text_features, int) + ) + if self.n_text_features < 0: + raise ValueError("'n_text_features' must be positive") + if not isinstance(self.type_text_features, str): + raise TypeError( + type_error_message("type_text_features", self.type_text_features, str) + ) + if self.type_text_features not in ("words", "ngrams", "tokens"): + raise ValueError( + "'type_text_features' must be among 'words', 'ngrams' or 'tokens'" + ) if self.construction_rules is not None: if not is_list_like(self.construction_rules): raise TypeError( @@ -1373,6 +1391,8 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir): # Rename parameters to be compatible with khiops.core kwargs["max_constructed_variables"] = kwargs.pop("n_features") kwargs["max_trees"] = kwargs.pop("n_trees") + kwargs["max_text_features"] = kwargs.pop("n_text_features") + kwargs["text_features"] = kwargs.pop("type_text_features") # Add the additional_data_tables parameter kwargs["additional_data_tables"] = additional_data_tables @@ -1548,6 +1568,8 @@ def __init__( self, n_features=100, n_trees=10, + n_text_features=10000, + type_text_features="words", n_selected_features=0, n_evaluated_features=0, specific_pairs=None, @@ -1560,6 +1582,8 @@ def __init__( super().__init__( n_features=n_features, n_trees=n_trees, + n_text_features=n_text_features, + type_text_features=type_text_features, specific_pairs=specific_pairs, all_possible_pairs=all_possible_pairs, construction_rules=construction_rules, @@ -1722,6 +1746,13 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor): combine other features, either native or constructed. These features usually improve the classifier's performance at the cost of interpretability of the model. + n_text_features : int, default 10000 + Maximum number of text features to construct. + type_text_features : str, default "words" + Type of the text features to construct. Can be either one of: + - "words": sequences of non-space characters + - "ngrams": sequences of bytes + - "tokens": user-defined n_selected_features : int, default 0 Maximum number of features to be selected in the SNB predictor. If equal to 0 it selects all the features kept in the training. @@ -1813,6 +1844,8 @@ def __init__( n_features=100, n_pairs=0, n_trees=10, + n_text_features=10000, + type_text_features="words", n_selected_features=0, n_evaluated_features=0, specific_pairs=None, @@ -1826,6 +1859,8 @@ def __init__( super().__init__( n_features=n_features, n_trees=n_trees, + n_text_features=n_text_features, + type_text_features=type_text_features, n_selected_features=n_selected_features, n_evaluated_features=n_evaluated_features, construction_rules=construction_rules, @@ -2217,6 +2252,8 @@ def __init__( self, n_features=100, n_trees=0, + n_text_features=10000, + type_text_features="words", n_selected_features=0, n_evaluated_features=0, construction_rules=None, @@ -2227,6 +2264,8 @@ def __init__( super().__init__( n_features=n_features, n_trees=n_trees, + n_text_features=n_text_features, + type_text_features=type_text_features, n_selected_features=n_selected_features, n_evaluated_features=n_evaluated_features, construction_rules=construction_rules, @@ -2376,6 +2415,13 @@ class KhiopsEncoder(TransformerMixin, KhiopsSupervisedEstimator): Maximum number of decision tree features to construct. The constructed trees combine other features, either native or constructed. These features usually improve a predictor's performance at the cost of interpretability of the model. + n_text_features : int, default 10000 + Maximum number of text features to construct. + type_text_features : str, default "words" + Type of the text features to construct. Can be either one of: + - "words": sequences of non-space characters + - "ngrams": sequences of bytes + - "tokens": user-defined specific_pairs : list of tuple, optional User-specified pairs as a list of 2-tuples of feature names. If a given tuple contains only one non-empty feature name, then it generates all the pairs @@ -2469,6 +2515,8 @@ def __init__( n_features=100, n_pairs=0, n_trees=0, + n_text_features=10000, + type_text_features="words", specific_pairs=None, all_possible_pairs=True, construction_rules=None, @@ -2485,6 +2533,8 @@ def __init__( super().__init__( n_features=n_features, n_trees=n_trees, + n_text_features=n_text_features, + type_text_features=type_text_features, construction_rules=construction_rules, verbose=verbose, output_dir=output_dir, diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index 23753852..b2f2b35c 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -100,15 +100,17 @@ def create_monotable_dataframe(self): False, False, ], + # Make some entries longer than 100 characters in the "Title" column + # to trigger the "Text" Khiops type assignment heuristic "Title": [ - "Awesome", + (15 * "Awesome ").strip(), "Very lovely", "Some major design flaws", "My favorite buy!", - "Flattering shirt", + (7 * "Flattering shirt ").strip(), "Not for the very petite", "Cagrcoal shimmer fun", - "Shimmer, surprisingly goes with lots", + (3 * "Shimmer, surprisingly goes with lots ").strip(), "Flattering", "Such a fun dress!", ], @@ -128,6 +130,9 @@ def create_monotable_dataframe(self): ], } dataset = pd.DataFrame(data) + + # Force StringDType on "Title" to have it of "Text" Khiops type + dataset["Title"] = dataset["Title"].astype("string") return dataset def create_monotable_data_file(self, table_path): @@ -294,7 +299,7 @@ def get_ref_var_types(self, multitable, schema=None): "Clothing ID": "Numerical", "Date": "Timestamp", "New": "Categorical", - "Title": "Categorical", + "Title": "Text", "Recommended IND": "Numerical", "Positive Feedback average": "Numerical", "class": "Categorical", @@ -489,7 +494,9 @@ def test_out_file_from_dataframe_monotable(self): # Create and load the intermediary Khiops file out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) - out_table = pd.read_csv(out_table_path, sep="\t") + + # Force StringDType on the "Title" column upon CSV reading + out_table = pd.read_csv(out_table_path, sep="\t", dtype={"Title": "string"}) # Cast "Date" columns to datetime as we don't automatically recognize dates out_table["Date"] = out_table["Date"].astype("datetime64[ns]") @@ -799,7 +806,7 @@ def create_monotable_dataset_with_newlines(): ], "Age": [39], "Title": [ - "Shimmer,\nsurprisingly\n\rgoes with lots", + (3 * "Shimmer,\nsurprisingly\n\rgoes with lots").strip(), ], } dataset = pd.DataFrame(data) @@ -814,7 +821,7 @@ def test_newlines_removed_from_csv_file_for_khiops(self): out_table = pd.read_csv(out_table_path, sep="\t") self.assertEqual( - "Shimmer, surprisingly goes with lots", + (3 * "Shimmer, surprisingly goes with lots").strip(), out_table.Title[0], "Newlines should have been removed from the data", ) diff --git a/tests/test_estimator_attributes.py b/tests/test_estimator_attributes.py index 958cb50e..ac4a46c6 100644 --- a/tests/test_estimator_attributes.py +++ b/tests/test_estimator_attributes.py @@ -187,7 +187,7 @@ def test_classifier_attributes_multitable(self): by Khiops post training. """ X, y = self._create_multitable_input() - khc_accidents = KhiopsClassifier(n_trees=0, n_pairs=10) + khc_accidents = KhiopsClassifier(n_trees=0, n_text_features=0, n_pairs=10) khc_accidents.fit(X, y) self.assert_attribute_values_ok(khc_accidents, X["main_table"][0], y) self.assertTrue(khc_accidents.is_multitable_model_) @@ -203,7 +203,7 @@ def test_regressor_attributes_monotable(self): adult_df = pd.read_csv(adult_dataset_path, sep="\t").sample(750) X = adult_df.drop("age", axis=1) y = adult_df["age"] - khr_adult = KhiopsRegressor(n_trees=0) + khr_adult = KhiopsRegressor(n_trees=0, n_text_features=0) with warnings.catch_warnings(): warnings.filterwarnings( action="ignore", @@ -225,7 +225,7 @@ def test_regressor_attributes_multitable(self): X, _ = self._create_multitable_input(750) y = X["main_table"][0]["Commune"] X["main_table"][0].drop("Commune", axis=1, inplace=True) - khr_accidents = KhiopsRegressor(n_trees=0) + khr_accidents = KhiopsRegressor(n_trees=0, n_text_features=0) with warnings.catch_warnings(): warnings.filterwarnings( action="ignore", @@ -262,7 +262,7 @@ def test_encoder_attributes_multitable(self): by Khiops post training. """ X, y = self._create_multitable_input() - khe_accidents = KhiopsEncoder(n_trees=5) + khe_accidents = KhiopsEncoder(n_trees=5, n_text_features=5000) khe_accidents.fit(X, y) self.assert_attribute_values_ok(khe_accidents, X, None) diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index d43f4ce8..0bf3c8de 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -749,6 +749,8 @@ def setUpClass(cls): "header_line": True, "max_pairs": 1, "max_trees": 5, + "max_text_features": 300000, + "text_features": "ngrams", "max_selected_variables": 1, "max_evaluated_variables": 3, "specific_pairs": [("age", "race")], @@ -777,6 +779,8 @@ def setUpClass(cls): "detect_format": False, "header_line": True, "max_trees": 0, + "max_text_features": 300000, + "text_features": "ngrams", "max_selected_variables": 1, "max_evaluated_variables": 3, "construction_rules": ["TableMode", "TableSelection"], @@ -803,6 +807,8 @@ def setUpClass(cls): "header_line": True, "max_pairs": 1, "max_trees": 5, + "max_text_features": 300000, + "text_features": "ngrams", "specific_pairs": [("age", "race")], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], @@ -841,6 +847,8 @@ def setUpClass(cls): "max_constructed_variables": 10, "max_pairs": 1, "max_trees": 5, + "max_text_features": 300000, + "text_features": "ngrams", "max_selected_variables": 1, "max_evaluated_variables": 3, "specific_pairs": [], @@ -870,6 +878,8 @@ def setUpClass(cls): "header_line": True, "max_constructed_variables": 10, "max_trees": 0, + "max_text_features": 300000, + "text_features": "ngrams", "max_selected_variables": 1, "max_evaluated_variables": 3, "construction_rules": ["TableMode", "TableSelection"], @@ -897,6 +907,8 @@ def setUpClass(cls): "max_constructed_variables": 10, "max_pairs": 1, "max_trees": 5, + "max_text_features": 300000, + "text_features": "ngrams", "specific_pairs": [], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], @@ -1410,6 +1422,8 @@ def test_parameter_transfer_classifier_fit_from_monotable_dataframe(self): extra_estimator_kwargs={ "n_pairs": 1, "n_trees": 5, + "n_text_features": 300000, + "type_text_features": "ngrams", "n_selected_features": 1, "n_evaluated_features": 3, "specific_pairs": [("age", "race")], @@ -1431,6 +1445,8 @@ def test_parameter_transfer_classifier_fit_from_monotable_dataframe_with_df_y( extra_estimator_kwargs={ "n_pairs": 1, "n_trees": 5, + "n_text_features": 300000, + "type_text_features": "ngrams", "n_selected_features": 1, "n_evaluated_features": 3, "specific_pairs": [("age", "race")], @@ -1451,6 +1467,8 @@ def test_parameter_transfer_classifier_fit_from_multitable_dataframe(self): "n_features": 10, "n_pairs": 1, "n_trees": 5, + "n_text_features": 300000, + "type_text_features": "ngrams", "n_selected_features": 1, "n_evaluated_features": 3, "specific_pairs": [], @@ -1488,6 +1506,8 @@ def test_parameter_transfer_encoder_fit_from_monotable_dataframe(self): extra_estimator_kwargs={ "n_pairs": 1, "n_trees": 5, + "n_text_features": 300000, + "type_text_features": "ngrams", "specific_pairs": [("age", "race")], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], @@ -1512,6 +1532,8 @@ def test_parameter_transfer_encoder_fit_from_monotable_dataframe_with_df_y( extra_estimator_kwargs={ "n_pairs": 1, "n_trees": 5, + "n_text_features": 300000, + "type_text_features": "ngrams", "specific_pairs": [("age", "race")], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], @@ -1535,6 +1557,8 @@ def test_parameter_transfer_encoder_fit_from_multitable_dataframe(self): "n_features": 10, "n_pairs": 1, "n_trees": 5, + "n_text_features": 300000, + "type_text_features": "ngrams", "specific_pairs": [], "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], @@ -1575,6 +1599,8 @@ def test_parameter_transfer_regressor_fit_from_monotable_dataframe(self): extra_estimator_kwargs={ "n_selected_features": 1, "n_evaluated_features": 3, + "n_text_features": 300000, + "type_text_features": "ngrams", "construction_rules": ["TableMode", "TableSelection"], }, ) @@ -1591,6 +1617,8 @@ def test_parameter_transfer_regressor_fit_from_monotable_dataframe_with_df_y( extra_estimator_kwargs={ "n_selected_features": 1, "n_evaluated_features": 3, + "n_text_features": 300000, + "type_text_features": "ngrams", "construction_rules": ["TableMode", "TableSelection"], }, ) @@ -1605,6 +1633,8 @@ def test_parameter_transfer_regressor_fit_from_multitable_dataframe(self): extra_estimator_kwargs={ "n_features": 10, "n_trees": 0, + "n_text_features": 300000, + "type_text_features": "ngrams", "n_selected_features": 1, "n_evaluated_features": 3, "construction_rules": ["TableMode", "TableSelection"], @@ -1693,6 +1723,7 @@ def test_sklearn_check_estimator(self): # Set the estimators to test # Notes: # - We use n_trees=0 so the tests execute faster + # - We use n_text_features=0 so the tests execute faster # - We omit KhiopsCoclustering because he needs special inputs to work well # and sklearn's check_estimator method does not accept them. # - KhiopsEncoder: @@ -1701,10 +1732,11 @@ def test_sklearn_check_estimator(self): # - We set it with informative_features_only=False so it always have output # columns (sklearn estimator checks expect non-empty encoders) khiops_estimators = [ - KhiopsClassifier(n_trees=0), - KhiopsRegressor(n_trees=0), + KhiopsClassifier(n_trees=0, n_text_features=0), + KhiopsRegressor(n_trees=0, n_text_features=0), KhiopsEncoder( n_trees=0, + n_text_features=0, informative_features_only=False, transform_type_numerical="0-1_normalization", ), diff --git a/tests/test_sklearn_output_types.py b/tests/test_sklearn_output_types.py index 40235bee..86d0f744 100644 --- a/tests/test_sklearn_output_types.py +++ b/tests/test_sklearn_output_types.py @@ -66,7 +66,7 @@ def test_classifier_output_types(self): "iris_sec": (raw_X_sec_mt, ["Id"]), }, } - khc = KhiopsClassifier(n_trees=0) + khc = KhiopsClassifier(n_trees=0, n_text_features=0) khc.fit(X, y) y_pred = khc.predict(X) khc.fit(X_mt, y) @@ -185,7 +185,7 @@ def test_classifier_output_types(self): estimator=KhiopsClassifier.__name__, ): # Train the classifier - khc = KhiopsClassifier(n_trees=0) + khc = KhiopsClassifier(n_trees=0, n_text_features=0) khc.fit(X, y) # Check the expected classes