Skip to content

Commit 9548b64

Browse files
committed
Add n_text_features and type_text_features to Khiops sklearn supervised estimators
1 parent 111996c commit 9548b64

4 files changed

Lines changed: 90 additions & 8 deletions

File tree

khiops/sklearn/estimators.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1199,6 +1199,8 @@ def __init__(
11991199
self,
12001200
n_features=100,
12011201
n_trees=10,
1202+
n_text_features=10000,
1203+
type_text_features="words",
12021204
specific_pairs=None,
12031205
all_possible_pairs=True,
12041206
construction_rules=None,
@@ -1213,6 +1215,8 @@ def __init__(
12131215
)
12141216
self.n_features = n_features
12151217
self.n_trees = n_trees
1218+
self.n_text_features = n_text_features
1219+
self.type_text_features = type_text_features
12161220
self.specific_pairs = specific_pairs
12171221
self.all_possible_pairs = all_possible_pairs
12181222
self.construction_rules = construction_rules
@@ -1280,6 +1284,20 @@ def _fit_check_params(self, ds, **kwargs):
12801284
raise TypeError(type_error_message("n_trees", self.n_trees, int))
12811285
if self.n_trees < 0:
12821286
raise ValueError("'n_trees' must be positive")
1287+
if not isinstance(self.n_text_features, int):
1288+
raise TypeError(
1289+
type_error_message("n_text_features", self.n_text_features, int)
1290+
)
1291+
if self.n_text_features < 0:
1292+
raise ValueError("'n_text_features' must be positive")
1293+
if not isinstance(self.type_text_features, str):
1294+
raise TypeError(
1295+
type_error_message("type_text_features", self.type_text_features, str)
1296+
)
1297+
if self.type_text_features not in ("words", "ngrams", "tokens"):
1298+
raise ValueError(
1299+
"'type_text_features' must be among 'words', 'ngrams' or 'tokens'"
1300+
)
12831301
if self.construction_rules is not None:
12841302
if not is_list_like(self.construction_rules):
12851303
raise TypeError(
@@ -1373,6 +1391,8 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir):
13731391
# Rename parameters to be compatible with khiops.core
13741392
kwargs["max_constructed_variables"] = kwargs.pop("n_features")
13751393
kwargs["max_trees"] = kwargs.pop("n_trees")
1394+
kwargs["max_text_features"] = kwargs.pop("n_text_features")
1395+
kwargs["text_features"] = kwargs.pop("type_text_features")
13761396

13771397
# Add the additional_data_tables parameter
13781398
kwargs["additional_data_tables"] = additional_data_tables
@@ -1548,6 +1568,8 @@ def __init__(
15481568
self,
15491569
n_features=100,
15501570
n_trees=10,
1571+
n_text_features=10000,
1572+
type_text_features="words",
15511573
n_selected_features=0,
15521574
n_evaluated_features=0,
15531575
specific_pairs=None,
@@ -1560,6 +1582,8 @@ def __init__(
15601582
super().__init__(
15611583
n_features=n_features,
15621584
n_trees=n_trees,
1585+
n_text_features=n_text_features,
1586+
type_text_features=type_text_features,
15631587
specific_pairs=specific_pairs,
15641588
all_possible_pairs=all_possible_pairs,
15651589
construction_rules=construction_rules,
@@ -1722,6 +1746,13 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
17221746
combine other features, either native or constructed. These features usually
17231747
improve the classifier's performance at the cost of interpretability of the
17241748
model.
1749+
n_text_features : int, default 10000
1750+
Maximum number of text features to construct.
1751+
type_text_features : str, default "words"
1752+
Type of the text features to construct. Must be one of:
1753+
- "words": sequences of non-space characters
1754+
- "ngrams": sequences of bytes
1755+
- "tokens": user-defined
17251756
n_selected_features : int, default 0
17261757
Maximum number of features to be selected in the SNB predictor. If equal to
17271758
0 it selects all the features kept in the training.
@@ -1813,6 +1844,8 @@ def __init__(
18131844
n_features=100,
18141845
n_pairs=0,
18151846
n_trees=10,
1847+
n_text_features=10000,
1848+
type_text_features="words",
18161849
n_selected_features=0,
18171850
n_evaluated_features=0,
18181851
specific_pairs=None,
@@ -1826,6 +1859,8 @@ def __init__(
18261859
super().__init__(
18271860
n_features=n_features,
18281861
n_trees=n_trees,
1862+
n_text_features=n_text_features,
1863+
type_text_features=type_text_features,
18291864
n_selected_features=n_selected_features,
18301865
n_evaluated_features=n_evaluated_features,
18311866
construction_rules=construction_rules,
@@ -2217,6 +2252,8 @@ def __init__(
22172252
self,
22182253
n_features=100,
22192254
n_trees=0,
2255+
n_text_features=10000,
2256+
type_text_features="words",
22202257
n_selected_features=0,
22212258
n_evaluated_features=0,
22222259
construction_rules=None,
@@ -2227,6 +2264,8 @@ def __init__(
22272264
super().__init__(
22282265
n_features=n_features,
22292266
n_trees=n_trees,
2267+
n_text_features=n_text_features,
2268+
type_text_features=type_text_features,
22302269
n_selected_features=n_selected_features,
22312270
n_evaluated_features=n_evaluated_features,
22322271
construction_rules=construction_rules,
@@ -2376,6 +2415,13 @@ class KhiopsEncoder(TransformerMixin, KhiopsSupervisedEstimator):
23762415
Maximum number of decision tree features to construct. The constructed trees
23772416
combine other features, either native or constructed. These features usually
23782417
improve a predictor's performance at the cost of interpretability of the model.
2418+
n_text_features : int, default 10000
2419+
Maximum number of text features to construct.
2420+
type_text_features : str, default "words"
2421+
Type of the text features to construct. Must be one of:
2422+
- "words": sequences of non-space characters
2423+
- "ngrams": sequences of bytes
2424+
- "tokens": user-defined
23792425
specific_pairs : list of tuple, optional
23802426
User-specified pairs as a list of 2-tuples of feature names. If a given tuple
23812427
contains only one non-empty feature name, then it generates all the pairs
@@ -2469,6 +2515,8 @@ def __init__(
24692515
n_features=100,
24702516
n_pairs=0,
24712517
n_trees=0,
2518+
n_text_features=10000,
2519+
type_text_features="words",
24722520
specific_pairs=None,
24732521
all_possible_pairs=True,
24742522
construction_rules=None,
@@ -2485,6 +2533,8 @@ def __init__(
24852533
super().__init__(
24862534
n_features=n_features,
24872535
n_trees=n_trees,
2536+
n_text_features=n_text_features,
2537+
type_text_features=type_text_features,
24882538
construction_rules=construction_rules,
24892539
verbose=verbose,
24902540
output_dir=output_dir,

tests/test_estimator_attributes.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def test_classifier_attributes_multitable(self):
187187
by Khiops post training.
188188
"""
189189
X, y = self._create_multitable_input()
190-
khc_accidents = KhiopsClassifier(n_trees=0, n_pairs=10)
190+
khc_accidents = KhiopsClassifier(n_trees=0, n_text_features=0, n_pairs=10)
191191
khc_accidents.fit(X, y)
192192
self.assert_attribute_values_ok(khc_accidents, X["main_table"][0], y)
193193
self.assertTrue(khc_accidents.is_multitable_model_)
@@ -203,7 +203,7 @@ def test_regressor_attributes_monotable(self):
203203
adult_df = pd.read_csv(adult_dataset_path, sep="\t").sample(750)
204204
X = adult_df.drop("age", axis=1)
205205
y = adult_df["age"]
206-
khr_adult = KhiopsRegressor(n_trees=0)
206+
khr_adult = KhiopsRegressor(n_trees=0, n_text_features=0)
207207
with warnings.catch_warnings():
208208
warnings.filterwarnings(
209209
action="ignore",
@@ -225,7 +225,7 @@ def test_regressor_attributes_multitable(self):
225225
X, _ = self._create_multitable_input(750)
226226
y = X["main_table"][0]["Commune"]
227227
X["main_table"][0].drop("Commune", axis=1, inplace=True)
228-
khr_accidents = KhiopsRegressor(n_trees=0)
228+
khr_accidents = KhiopsRegressor(n_trees=0, n_text_features=0)
229229
with warnings.catch_warnings():
230230
warnings.filterwarnings(
231231
action="ignore",
@@ -262,7 +262,7 @@ def test_encoder_attributes_multitable(self):
262262
by Khiops post training.
263263
"""
264264
X, y = self._create_multitable_input()
265-
khe_accidents = KhiopsEncoder(n_trees=5)
265+
khe_accidents = KhiopsEncoder(n_trees=5, n_text_features=5000)
266266
khe_accidents.fit(X, y)
267267

268268
self.assert_attribute_values_ok(khe_accidents, X, None)

tests/test_sklearn.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -749,6 +749,8 @@ def setUpClass(cls):
749749
"header_line": True,
750750
"max_pairs": 1,
751751
"max_trees": 5,
752+
"max_text_features": 300000,
753+
"text_features": "ngrams",
752754
"max_selected_variables": 1,
753755
"max_evaluated_variables": 3,
754756
"specific_pairs": [("age", "race")],
@@ -777,6 +779,8 @@ def setUpClass(cls):
777779
"detect_format": False,
778780
"header_line": True,
779781
"max_trees": 0,
782+
"max_text_features": 300000,
783+
"text_features": "ngrams",
780784
"max_selected_variables": 1,
781785
"max_evaluated_variables": 3,
782786
"construction_rules": ["TableMode", "TableSelection"],
@@ -803,6 +807,8 @@ def setUpClass(cls):
803807
"header_line": True,
804808
"max_pairs": 1,
805809
"max_trees": 5,
810+
"max_text_features": 300000,
811+
"text_features": "ngrams",
806812
"specific_pairs": [("age", "race")],
807813
"all_possible_pairs": False,
808814
"construction_rules": ["TableMode", "TableSelection"],
@@ -841,6 +847,8 @@ def setUpClass(cls):
841847
"max_constructed_variables": 10,
842848
"max_pairs": 1,
843849
"max_trees": 5,
850+
"max_text_features": 300000,
851+
"text_features": "ngrams",
844852
"max_selected_variables": 1,
845853
"max_evaluated_variables": 3,
846854
"specific_pairs": [],
@@ -870,6 +878,8 @@ def setUpClass(cls):
870878
"header_line": True,
871879
"max_constructed_variables": 10,
872880
"max_trees": 0,
881+
"max_text_features": 300000,
882+
"text_features": "ngrams",
873883
"max_selected_variables": 1,
874884
"max_evaluated_variables": 3,
875885
"construction_rules": ["TableMode", "TableSelection"],
@@ -897,6 +907,8 @@ def setUpClass(cls):
897907
"max_constructed_variables": 10,
898908
"max_pairs": 1,
899909
"max_trees": 5,
910+
"max_text_features": 300000,
911+
"text_features": "ngrams",
900912
"specific_pairs": [],
901913
"all_possible_pairs": False,
902914
"construction_rules": ["TableMode", "TableSelection"],
@@ -1410,6 +1422,8 @@ def test_parameter_transfer_classifier_fit_from_monotable_dataframe(self):
14101422
extra_estimator_kwargs={
14111423
"n_pairs": 1,
14121424
"n_trees": 5,
1425+
"n_text_features": 300000,
1426+
"type_text_features": "ngrams",
14131427
"n_selected_features": 1,
14141428
"n_evaluated_features": 3,
14151429
"specific_pairs": [("age", "race")],
@@ -1431,6 +1445,8 @@ def test_parameter_transfer_classifier_fit_from_monotable_dataframe_with_df_y(
14311445
extra_estimator_kwargs={
14321446
"n_pairs": 1,
14331447
"n_trees": 5,
1448+
"n_text_features": 300000,
1449+
"type_text_features": "ngrams",
14341450
"n_selected_features": 1,
14351451
"n_evaluated_features": 3,
14361452
"specific_pairs": [("age", "race")],
@@ -1451,6 +1467,8 @@ def test_parameter_transfer_classifier_fit_from_multitable_dataframe(self):
14511467
"n_features": 10,
14521468
"n_pairs": 1,
14531469
"n_trees": 5,
1470+
"n_text_features": 300000,
1471+
"type_text_features": "ngrams",
14541472
"n_selected_features": 1,
14551473
"n_evaluated_features": 3,
14561474
"specific_pairs": [],
@@ -1488,6 +1506,8 @@ def test_parameter_transfer_encoder_fit_from_monotable_dataframe(self):
14881506
extra_estimator_kwargs={
14891507
"n_pairs": 1,
14901508
"n_trees": 5,
1509+
"n_text_features": 300000,
1510+
"type_text_features": "ngrams",
14911511
"specific_pairs": [("age", "race")],
14921512
"all_possible_pairs": False,
14931513
"construction_rules": ["TableMode", "TableSelection"],
@@ -1512,6 +1532,8 @@ def test_parameter_transfer_encoder_fit_from_monotable_dataframe_with_df_y(
15121532
extra_estimator_kwargs={
15131533
"n_pairs": 1,
15141534
"n_trees": 5,
1535+
"n_text_features": 300000,
1536+
"type_text_features": "ngrams",
15151537
"specific_pairs": [("age", "race")],
15161538
"all_possible_pairs": False,
15171539
"construction_rules": ["TableMode", "TableSelection"],
@@ -1535,6 +1557,8 @@ def test_parameter_transfer_encoder_fit_from_multitable_dataframe(self):
15351557
"n_features": 10,
15361558
"n_pairs": 1,
15371559
"n_trees": 5,
1560+
"n_text_features": 300000,
1561+
"type_text_features": "ngrams",
15381562
"specific_pairs": [],
15391563
"all_possible_pairs": False,
15401564
"construction_rules": ["TableMode", "TableSelection"],
@@ -1575,6 +1599,8 @@ def test_parameter_transfer_regressor_fit_from_monotable_dataframe(self):
15751599
extra_estimator_kwargs={
15761600
"n_selected_features": 1,
15771601
"n_evaluated_features": 3,
1602+
"n_text_features": 300000,
1603+
"type_text_features": "ngrams",
15781604
"construction_rules": ["TableMode", "TableSelection"],
15791605
},
15801606
)
@@ -1591,6 +1617,8 @@ def test_parameter_transfer_regressor_fit_from_monotable_dataframe_with_df_y(
15911617
extra_estimator_kwargs={
15921618
"n_selected_features": 1,
15931619
"n_evaluated_features": 3,
1620+
"n_text_features": 300000,
1621+
"type_text_features": "ngrams",
15941622
"construction_rules": ["TableMode", "TableSelection"],
15951623
},
15961624
)
@@ -1605,6 +1633,8 @@ def test_parameter_transfer_regressor_fit_from_multitable_dataframe(self):
16051633
extra_estimator_kwargs={
16061634
"n_features": 10,
16071635
"n_trees": 0,
1636+
"n_text_features": 300000,
1637+
"type_text_features": "ngrams",
16081638
"n_selected_features": 1,
16091639
"n_evaluated_features": 3,
16101640
"construction_rules": ["TableMode", "TableSelection"],
@@ -1693,6 +1723,7 @@ def test_sklearn_check_estimator(self):
16931723
# Set the estimators to test
16941724
# Notes:
16951725
# - We use n_trees=0 so the tests execute faster
1726+
# - We use n_text_features=0 so the tests execute faster
16961727
# - We omit KhiopsCoclustering because it needs special inputs to work well
16971728
# and sklearn's check_estimator method does not accept them.
16981729
# - KhiopsEncoder:
@@ -1701,10 +1732,11 @@ def test_sklearn_check_estimator(self):
17011732
# - We set it with informative_features_only=False so it always has output
17021733
# columns (sklearn estimator checks expect non-empty encoders)
17031734
khiops_estimators = [
1704-
KhiopsClassifier(n_trees=0),
1705-
KhiopsRegressor(n_trees=0),
1735+
KhiopsClassifier(n_trees=0, n_text_features=0),
1736+
KhiopsRegressor(n_trees=0, n_text_features=0),
17061737
KhiopsEncoder(
17071738
n_trees=0,
1739+
n_text_features=0,
17081740
informative_features_only=False,
17091741
transform_type_numerical="0-1_normalization",
17101742
),

tests/test_sklearn_output_types.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def test_classifier_output_types(self):
6666
"iris_sec": (raw_X_sec_mt, ["Id"]),
6767
},
6868
}
69-
khc = KhiopsClassifier(n_trees=0)
69+
khc = KhiopsClassifier(n_trees=0, n_text_features=0)
7070
khc.fit(X, y)
7171
y_pred = khc.predict(X)
7272
khc.fit(X_mt, y)
@@ -185,7 +185,7 @@ def test_classifier_output_types(self):
185185
estimator=KhiopsClassifier.__name__,
186186
):
187187
# Train the classifier
188-
khc = KhiopsClassifier(n_trees=0)
188+
khc = KhiopsClassifier(n_trees=0, n_text_features=0)
189189
khc.fit(X, y)
190190

191191
# Check the expected classes

0 commit comments

Comments
 (0)