Skip to content

Commit b9f51fa

Browse files
authored
Merge pull request #494 from KhiopsML/drop-importances-from-khiops-sklearn-estimator-attributes
Drop importances from khiops sklearn estimator attributes
2 parents be078f9 + dc8bbd9 commit b9f51fa

File tree

11 files changed

+79
-390
lines changed

11 files changed

+79
-390
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,23 @@
1313
comments, and dictionary and variable block internal comments.
1414
- (`core`) Dictionary `Rule` class and supporting API for serializing `Rule` instances.
1515
- (`core`) New way to add a variable to a dictionary using a complete specification.
16+
- (`core`) New API constants for rules used in automatic variable construction:
17+
- `DEFAULT_CONSTRUCTION_RULES`: names of table and entity construction rules,
18+
which are applied by default
19+
- `CALENDRICAL_CONSTRUCTION_RULES`: names of date, time and timestamp rules.
1620
- (`sklearn`) `Text` Khiops type support at the estimator level.
1721

1822
### Changed
1923
- (`core`) Dictionary API (DictionaryDomain, Dictionary, MetaData),
2024
when a requested key is not found in getters, return ``None`` instead
2125
of raising a `KeyError` exception.
2226

27+
### Removed
28+
- (`sklearn`) Remove the `n_features_evaluated_`, `feature_evaluated_names_`,
29+
`feature_evaluated_importances_`, `n_features_used_`, `feature_used_names_`
30+
and `feature_used_importances_` Khiops classifier and regressor estimator
31+
attributes.
32+
2333
### Fixed
2434
- (General) Inconsistency between the `tools.download_datasets` function and the
2535
current samples directory according to `core.api.get_samples_dir()`.

doc/samples/samples_sklearn.rst

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,6 @@ Samples
6868
# Train the classifier
6969
khc.fit(X_train, y_train)
7070
71-
# Show the feature importance info
72-
print(f"Features evaluated: {khc.n_features_evaluated_}")
73-
print(f"Features selected : {khc.n_features_used_}")
74-
print("Top 3 used features")
75-
for i, feature in enumerate(khc.feature_used_names_[:3]):
76-
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
77-
print("---")
78-
7971
# Predict the classes on the test dataset
8072
y_test_pred = khc.predict(X_test)
8173
print("Predicted classes (first 10):")
@@ -186,14 +178,6 @@ Samples
186178
# Train the classifier
187179
khc.fit(X_train, y_train)
188180
189-
# Show the feature importance info
190-
print(f"Features evaluated: {khc.n_features_evaluated_}")
191-
print(f"Features selected : {khc.n_features_used_}")
192-
print("Top 3 used features")
193-
for i, feature in enumerate(khc.feature_used_names_[:3]):
194-
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
195-
print("---")
196-
197181
# Predict the classes on the test dataset
198182
y_test_pred = khc.predict(X_test)
199183
print("Predicted classes (first 10):")
@@ -307,14 +291,6 @@ Samples
307291
khc = KhiopsClassifier(n_trees=0)
308292
khc.fit(X_train, y_train)
309293
310-
# Show the feature importance info
311-
print(f"Features evaluated: {khc.n_features_evaluated_}")
312-
print(f"Features selected : {khc.n_features_used_}")
313-
print("Top 3 used features")
314-
for i, feature in enumerate(khc.feature_used_names_[:3]):
315-
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
316-
print("---")
317-
318294
# Predict the class on the test dataset
319295
y_test_pred = khc.predict(X_test)
320296
print("Predicted classes (first 10):")
@@ -539,14 +515,6 @@ Samples
539515
# Train the regressor
540516
khr.fit(X_train, y_train)
541517
542-
# Show the feature importance info
543-
print(f"Features evaluated: {khr.n_features_evaluated_}")
544-
print(f"Features selected : {khr.n_features_used_}")
545-
print("Top 3 used features")
546-
for i, feature in enumerate(khr.feature_used_names_[:3]):
547-
print(f"{feature} - Importance: {khr.feature_used_importances_[i][2]}")
548-
print("---")
549-
550518
# Predict the values on the test dataset
551519
y_test_pred = khr.predict(X_test)
552520
print("Predicted values for 'age' (first 10):")
@@ -667,13 +635,6 @@ Samples
667635
khe = KhiopsEncoder(n_features=10)
668636
khe.fit(X, y)
669637
670-
# Show the feature importance info
671-
print(f"Features evaluated: {khe.n_features_evaluated_}")
672-
print("Top 3 evaluated features")
673-
for i, feature in enumerate(khe.feature_evaluated_names_[:3]):
674-
print(f"{feature} - Level: {khe.feature_evaluated_importances_[i]}")
675-
print("---")
676-
677638
# Transform the train dataset
678639
print("Encoded feature names:")
679640
print(khe.feature_names_out_)

khiops/core/api.py

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -31,18 +31,10 @@
3131
from khiops.core.internals.runner import get_runner
3232
from khiops.core.internals.task import get_task_registry
3333

34-
# List of all available construction rules in the Khiops tool
35-
all_construction_rules = [
36-
"Day",
37-
"DecimalTime",
38-
"DecimalWeekDay",
39-
"DecimalYear",
40-
"DecimalYearTS",
41-
"GetDate",
42-
"GetTime",
34+
# Construction rules
35+
DEFAULT_CONSTRUCTION_RULES = [
4336
"GetValue",
4437
"GetValueC",
45-
"LocalTimestamp",
4638
"TableCount",
4739
"TableCountDistinct",
4840
"TableMax",
@@ -53,9 +45,37 @@
5345
"TableSelection",
5446
"TableStdDev",
5547
"TableSum",
48+
]
49+
"""List of construction rules that Khiops uses by default
50+
51+
.. note::
52+
These are all the multi-table rules.
53+
""" # pylint: disable=pointless-string-statement
54+
55+
CALENDRICAL_CONSTRUCTION_RULES = [
56+
"Day",
57+
"DecimalTime",
58+
"DecimalWeekDay",
59+
"DecimalYear",
60+
"DecimalYearTS",
61+
"GetDate",
62+
"GetTime",
63+
"LocalTimestamp",
5664
"WeekDay",
5765
"YearDay",
5866
]
67+
"""List of calendrical construction rules
68+
69+
These rules include: date, time and timestamp rules.
70+
71+
.. note::
72+
These rules are not enabled by default. The user needs to explicitly
73+
select each of them via the ``construction_rules`` parameter of the
74+
relevant Core API functions.
75+
""" # pylint: disable=pointless-string-statement
76+
77+
# List of all available construction rules in the Khiops tool
78+
ALL_CONSTRUCTION_RULES = DEFAULT_CONSTRUCTION_RULES + CALENDRICAL_CONSTRUCTION_RULES
5979

6080
##########################
6181
# Private module methods #
@@ -758,8 +778,9 @@ def train_predictor(
758778
max_constructed_variables : int, default 1000
759779
Maximum number of variables to construct.
760780
construction_rules : list of str, optional
761-
Allowed rules for the automatic variable construction. If not set it uses all
762-
possible rules.
781+
Allowed rules for the automatic variable construction. If not set, Khiops
782+
uses the multi-table construction rules listed in
783+
`DEFAULT_CONSTRUCTION_RULES`.
763784
max_text_features : int, default 10000
764785
Maximum number of text features to construct.
765786
text_features : str, default "words"
@@ -1190,21 +1211,22 @@ def train_recoder(
11901211
max_constructed_variables : int, default 100
11911212
Maximum number of variables to construct.
11921213
construction_rules : list of str, optional
1193-
Allowed rules for the automatic variable construction. If not set it uses all
1194-
possible rules.
1214+
Allowed rules for the automatic variable construction. If not set, Khiops
1215+
uses the multi-table construction rules listed in
1216+
`DEFAULT_CONSTRUCTION_RULES`.
11951217
max_text_features : int, default 10000
11961218
Maximum number of text features to construct.
11971219
text_features : str, default "words"
11981220
Type of the text features. Can be either one of:
11991221
1200-
- "words": sequences of non-space characters
1201-
- "ngrams": sequences of bytes
1202-
- "tokens": user-defined
1222+
- "words": sequences of non-space characters
1223+
- "ngrams": sequences of bytes
1224+
- "tokens": user-defined
12031225
12041226
max_trees : int, default 10
12051227
Maximum number of trees to construct.
12061228
max_pairs : int, default 0
1207-
Maximum number of variables pairs to construct.
1229+
Maximum number of variable pairs to construct.
12081230
specific_pairs : list of tuple, optional
12091231
User-specified pairs as a list of 2-tuples of feature names. If a given tuple
12101232
contains only one non-empty feature name, then it generates all the pairs

khiops/core/dictionary.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,7 @@ def get_value(self, key):
733733
734734
Returns
735735
-------
736-
`Metadata`
736+
`MetaData`
737737
Metadata value associated to the specified key. ``None`` is returned
738738
if the metadata key is not found.
739739
"""
@@ -1267,7 +1267,7 @@ def get_value(self, key):
12671267
12681268
Returns
12691269
-------
1270-
`Metadata`
1270+
`MetaData`
12711271
Metadata value associated to the specified key. ``None`` is returned
12721272
if the metadata key is not found.
12731273
"""
@@ -1527,7 +1527,7 @@ def get_value(self, key):
15271527
15281528
Returns
15291529
-------
1530-
`Metadata`
1530+
`MetaData`
15311531
Metadata value associated to the specified key. ``None`` is returned
15321532
if the metadata key is not found.
15331533
"""

khiops/samples/samples.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -503,7 +503,7 @@
503503
"metadata": {},
504504
"source": [
505505
"### `train_predictor_mt_with_specific_rules()`\n\n",
506-
"Trains a multi-table predictor with specific construction rules\n\n It is the same as `.train_predictor_mt` but with the specification of the allowed\n variable construction rules. The list of available rules is found in the field\n ``kh.all_construction_rules``\n \n"
506+
"Trains a multi-table predictor with specific construction rules\n\n It is the same as `.train_predictor_mt` but with the specification of the allowed\n variable construction rules. The list of available rules is found in the field\n ``kh.ALL_CONSTRUCTION_RULES``\n \n"
507507
]
508508
},
509509
{

khiops/samples/samples.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ def train_predictor_mt_with_specific_rules():
432432
433433
It is the same as `.train_predictor_mt` but with the specification of the allowed
434434
variable construction rules. The list of available rules is found in the field
435-
``kh.all_construction_rules``
435+
``kh.ALL_CONSTRUCTION_RULES``
436436
"""
437437
# Imports
438438
import os

khiops/samples/samples_sklearn.ipynb

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,6 @@
5454
"# Train the classifier\n",
5555
"khc.fit(X_train, y_train)\n",
5656
"\n",
57-
"# Show the feature importance info\n",
58-
"print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n",
59-
"print(f\"Features selected : {khc.n_features_used_}\")\n",
60-
"print(\"Top 3 used features\")\n",
61-
"for i, feature in enumerate(khc.feature_used_names_[:3]):\n",
62-
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
63-
"print(\"---\")\n",
64-
"\n",
6557
"# Predict the classes on the test dataset\n",
6658
"y_test_pred = khc.predict(X_test)\n",
6759
"print(\"Predicted classes (first 10):\")\n",
@@ -198,14 +190,6 @@
198190
"# Train the classifier\n",
199191
"khc.fit(X_train, y_train)\n",
200192
"\n",
201-
"# Show the feature importance info\n",
202-
"print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n",
203-
"print(f\"Features selected : {khc.n_features_used_}\")\n",
204-
"print(\"Top 3 used features\")\n",
205-
"for i, feature in enumerate(khc.feature_used_names_[:3]):\n",
206-
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
207-
"print(\"---\")\n",
208-
"\n",
209193
"# Predict the classes on the test dataset\n",
210194
"y_test_pred = khc.predict(X_test)\n",
211195
"print(\"Predicted classes (first 10):\")\n",
@@ -345,14 +329,6 @@
345329
"khc = KhiopsClassifier(n_trees=0)\n",
346330
"khc.fit(X_train, y_train)\n",
347331
"\n",
348-
"# Show the feature importance info\n",
349-
"print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n",
350-
"print(f\"Features selected : {khc.n_features_used_}\")\n",
351-
"print(\"Top 3 used features\")\n",
352-
"for i, feature in enumerate(khc.feature_used_names_[:3]):\n",
353-
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
354-
"print(\"---\")\n",
355-
"\n",
356332
"# Predict the class on the test dataset\n",
357333
"y_test_pred = khc.predict(X_test)\n",
358334
"print(\"Predicted classes (first 10):\")\n",
@@ -629,14 +605,6 @@
629605
"# Train the regressor\n",
630606
"khr.fit(X_train, y_train)\n",
631607
"\n",
632-
"# Show the feature importance info\n",
633-
"print(f\"Features evaluated: {khr.n_features_evaluated_}\")\n",
634-
"print(f\"Features selected : {khr.n_features_used_}\")\n",
635-
"print(\"Top 3 used features\")\n",
636-
"for i, feature in enumerate(khr.feature_used_names_[:3]):\n",
637-
" print(f\"{feature} - Importance: {khr.feature_used_importances_[i][2]}\")\n",
638-
"print(\"---\")\n",
639-
"\n",
640608
"# Predict the values on the test dataset\n",
641609
"y_test_pred = khr.predict(X_test)\n",
642610
"print(\"Predicted values for 'age' (first 10):\")\n",
@@ -796,13 +764,6 @@
796764
"khe = KhiopsEncoder(n_features=10)\n",
797765
"khe.fit(X, y)\n",
798766
"\n",
799-
"# Show the feature importance info\n",
800-
"print(f\"Features evaluated: {khe.n_features_evaluated_}\")\n",
801-
"print(\"Top 3 evaluated features\")\n",
802-
"for i, feature in enumerate(khe.feature_evaluated_names_[:3]):\n",
803-
" print(f\"{feature} - Level: {khe.feature_evaluated_importances_[i]}\")\n",
804-
"print(\"---\")\n",
805-
"\n",
806767
"# Transform the train dataset\n",
807768
"print(\"Encoded feature names:\")\n",
808769
"print(khe.feature_names_out_)\n",

khiops/samples/samples_sklearn.py

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,6 @@ def khiops_classifier():
5757
# Train the classifier
5858
khc.fit(X_train, y_train)
5959

60-
# Show the feature importance info
61-
print(f"Features evaluated: {khc.n_features_evaluated_}")
62-
print(f"Features selected : {khc.n_features_used_}")
63-
print("Top 3 used features")
64-
for i, feature in enumerate(khc.feature_used_names_[:3]):
65-
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
66-
print("---")
67-
6860
# Predict the classes on the test dataset
6961
y_test_pred = khc.predict(X_test)
7062
print("Predicted classes (first 10):")
@@ -181,14 +173,6 @@ def khiops_classifier_text():
181173
# Train the classifier
182174
khc.fit(X_train, y_train)
183175

184-
# Show the feature importance info
185-
print(f"Features evaluated: {khc.n_features_evaluated_}")
186-
print(f"Features selected : {khc.n_features_used_}")
187-
print("Top 3 used features")
188-
for i, feature in enumerate(khc.feature_used_names_[:3]):
189-
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
190-
print("---")
191-
192176
# Predict the classes on the test dataset
193177
y_test_pred = khc.predict(X_test)
194178
print("Predicted classes (first 10):")
@@ -310,14 +294,6 @@ def khiops_classifier_multitable_snowflake():
310294
khc = KhiopsClassifier(n_trees=0)
311295
khc.fit(X_train, y_train)
312296

313-
# Show the feature importance info
314-
print(f"Features evaluated: {khc.n_features_evaluated_}")
315-
print(f"Features selected : {khc.n_features_used_}")
316-
print("Top 3 used features")
317-
for i, feature in enumerate(khc.feature_used_names_[:3]):
318-
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
319-
print("---")
320-
321297
# Predict the class on the test dataset
322298
y_test_pred = khc.predict(X_test)
323299
print("Predicted classes (first 10):")
@@ -550,14 +526,6 @@ def khiops_regressor():
550526
# Train the regressor
551527
khr.fit(X_train, y_train)
552528

553-
# Show the feature importance info
554-
print(f"Features evaluated: {khr.n_features_evaluated_}")
555-
print(f"Features selected : {khr.n_features_used_}")
556-
print("Top 3 used features")
557-
for i, feature in enumerate(khr.feature_used_names_[:3]):
558-
print(f"{feature} - Importance: {khr.feature_used_importances_[i][2]}")
559-
print("---")
560-
561529
# Predict the values on the test dataset
562530
y_test_pred = khr.predict(X_test)
563531
print("Predicted values for 'age' (first 10):")
@@ -696,13 +664,6 @@ def khiops_encoder_multitable_snowflake():
696664
khe = KhiopsEncoder(n_features=10)
697665
khe.fit(X, y)
698666

699-
# Show the feature importance info
700-
print(f"Features evaluated: {khe.n_features_evaluated_}")
701-
print("Top 3 evaluated features")
702-
for i, feature in enumerate(khe.feature_evaluated_names_[:3]):
703-
print(f"{feature} - Level: {khe.feature_evaluated_importances_[i]}")
704-
print("---")
705-
706667
# Transform the train dataset
707668
print("Encoded feature names:")
708669
print(khe.feature_names_out_)

0 commit comments

Comments
 (0)