Skip to content
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@
comments, and dictionary and variable block internal comments.
- (`core`) Dictionary `Rule` class and supporting API for serializing `Rule` instances.
- (`core`) New way to add a variable to a dictionary using a complete specification.
- (`core`) New API constants for rules used in automatic variable construction:
- `DEFAULT_CONSTRUCTION_RULES`: names of the table and entity construction rules,
which are applied by default.
- `CALENDRICAL_CONSTRUCTION_RULES`: names of the date, time and timestamp rules, which are not applied by default and must be selected explicitly.
- (`sklearn`) `Text` Khiops type support at the estimator level.
- (`sklearn`) The `feature_names_in_` and `feature_importances_` Khiops
classifier and regressor estimator attributes.
Comment thread
folmos-at-orange marked this conversation as resolved.

### Changed
- (`core`) Dictionary API (DictionaryDomain, Dictionary, MetaData),
Expand Down
32 changes: 15 additions & 17 deletions doc/samples/samples_sklearn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,19 @@ Samples
khc.fit(X_train, y_train)

# Show the feature importance info
print(f"Features evaluated: {khc.n_features_evaluated_}")
print(f"Features selected : {khc.n_features_used_}")
print("Top 3 used features")
for i, feature in enumerate(khc.feature_used_names_[:3]):
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
print("Top 5 used features")
for i, feature in enumerate(khc.feature_used_names_[:5]):
print(f"{feature} - Importance: {khc.feature_used_importances_[i]}")
print("---")

print("Top 5 used features, among those present in the dataset")
for feature, importance in sorted(
zip(khc.feature_names_in_, khc.feature_importances_),
key=lambda imp: imp[1],
reverse=True,
)[:5]:
print(f"{feature} - Importance: {importance}")
print("---")

# Predict the classes on the test dataset
Expand Down Expand Up @@ -187,11 +195,10 @@ Samples
khc.fit(X_train, y_train)

# Show the feature importance info
print(f"Features evaluated: {khc.n_features_evaluated_}")
print(f"Features selected : {khc.n_features_used_}")
print("Top 3 used features")
for i, feature in enumerate(khc.feature_used_names_[:3]):
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
print(f"{feature} - Importance: {khc.feature_used_importances_[i]}")
print("---")

# Predict the classes on the test dataset
Expand Down Expand Up @@ -308,11 +315,10 @@ Samples
khc.fit(X_train, y_train)

# Show the feature importance info
print(f"Features evaluated: {khc.n_features_evaluated_}")
print(f"Features selected : {khc.n_features_used_}")
print("Top 3 used features")
for i, feature in enumerate(khc.feature_used_names_[:3]):
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
print(f"{feature} - Importance: {khc.feature_used_importances_[i]}")
print("---")

# Predict the class on the test dataset
Expand Down Expand Up @@ -540,11 +546,10 @@ Samples
khr.fit(X_train, y_train)

# Show the feature importance info
print(f"Features evaluated: {khr.n_features_evaluated_}")
print(f"Features selected : {khr.n_features_used_}")
print("Top 3 used features")
for i, feature in enumerate(khr.feature_used_names_[:3]):
print(f"{feature} - Importance: {khr.feature_used_importances_[i][2]}")
print(f"{feature} - Importance: {khr.feature_used_importances_[i]}")
print("---")

# Predict the values on the test dataset
Expand Down Expand Up @@ -667,13 +672,6 @@ Samples
khe = KhiopsEncoder(n_features=10)
khe.fit(X, y)

# Show the feature importance info
print(f"Features evaluated: {khe.n_features_evaluated_}")
print("Top 3 evaluated features")
for i, feature in enumerate(khe.feature_evaluated_names_[:3]):
print(f"{feature} - Level: {khe.feature_evaluated_importances_[i]}")
print("---")

# Transform the train dataset
print("Encoded feature names:")
print(khe.feature_names_out_)
Expand Down
58 changes: 40 additions & 18 deletions khiops/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,10 @@
from khiops.core.internals.runner import get_runner
from khiops.core.internals.task import get_task_registry

# List of all available construction rules in the Khiops tool
all_construction_rules = [
"Day",
"DecimalTime",
"DecimalWeekDay",
"DecimalYear",
"DecimalYearTS",
"GetDate",
"GetTime",
# Construction rules
DEFAULT_CONSTRUCTION_RULES = [
"GetValue",
"GetValueC",
"LocalTimestamp",
"TableCount",
"TableCountDistinct",
"TableMax",
Expand All @@ -53,9 +45,37 @@
"TableSelection",
"TableStdDev",
"TableSum",
]
"""List of construction rules that Khiops uses by default

.. note::
These are all the multi-table rules.
""" # pylint: disable=pointless-string-statement

CALENDRICAL_CONSTRUCTION_RULES = [
"Day",
"DecimalTime",
"DecimalWeekDay",
"DecimalYear",
"DecimalYearTS",
"GetDate",
"GetTime",
"LocalTimestamp",
"WeekDay",
"YearDay",
]
"""List of calendrical construction rules

These rules include: date, time and timestamp rules.

.. note::
These rules are not enabled by default. The user needs to explicitly
select each of them via the ``construction_rules`` parameter of the
relevant Core API functions.
""" # pylint: disable=pointless-string-statement

# List of all available construction rules in the Khiops tool
ALL_CONSTRUCTION_RULES = DEFAULT_CONSTRUCTION_RULES + CALENDRICAL_CONSTRUCTION_RULES

##########################
# Private module methods #
Expand Down Expand Up @@ -758,8 +778,9 @@ def train_predictor(
max_constructed_variables : int, default 1000
Maximum number of variables to construct.
construction_rules : list of str, optional
Allowed rules for the automatic variable construction. If not set it uses all
possible rules.
Allowed rules for the automatic variable construction. If not set, Khiops
uses the multi-table construction rules listed in
`DEFAULT_CONSTRUCTION_RULES`.
max_text_features : int, default 10000
Maximum number of text features to construct.
text_features : str, default "words"
Expand Down Expand Up @@ -1190,21 +1211,22 @@ def train_recoder(
max_constructed_variables : int, default 100
Maximum number of variables to construct.
construction_rules : list of str, optional
Allowed rules for the automatic variable construction. If not set it uses all
possible rules.
Allowed rules for the automatic variable construction. If not set, Khiops
uses the multi-table construction rules listed in
`DEFAULT_CONSTRUCTION_RULES`.
max_text_features : int, default 10000
Maximum number of text features to construct.
text_features : str, default "words"
Type of the text features. Can be either one of:

- "words": sequences of non-space characters
- "ngrams": sequences of bytes
- "tokens": user-defined
- "words": sequences of non-space characters
- "ngrams": sequences of bytes
- "tokens": user-defined

max_trees : int, default 10
Maximum number of trees to construct.
max_pairs : int, default 0
Maximum number of variables pairs to construct.
Maximum number of variable pairs to construct.
specific_pairs : list of tuple, optional
User-specified pairs as a list of 2-tuples of feature names. If a given tuple
contains only one non-empty feature name, then it generates all the pairs
Expand Down
6 changes: 3 additions & 3 deletions khiops/core/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,7 @@ def get_value(self, key):

Returns
-------
`Metadata`
`MetaData`
Metadata value associated with the specified key. ``None`` is returned
if the metadata key is not found.
"""
Expand Down Expand Up @@ -1267,7 +1267,7 @@ def get_value(self, key):

Returns
-------
`Metadata`
`MetaData`
Metadata value associated with the specified key. ``None`` is returned
if the metadata key is not found.
"""
Expand Down Expand Up @@ -1527,7 +1527,7 @@ def get_value(self, key):

Returns
-------
`Metadata`
`MetaData`
Metadata value associated with the specified key. ``None`` is returned
if the metadata key is not found.
"""
Expand Down
2 changes: 1 addition & 1 deletion khiops/samples/samples.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@
"metadata": {},
"source": [
"### `train_predictor_mt_with_specific_rules()`\n\n",
"Trains a multi-table predictor with specific construction rules\n\n It is the same as `.train_predictor_mt` but with the specification of the allowed\n variable construction rules. The list of available rules is found in the field\n ``kh.all_construction_rules``\n \n"
"Trains a multi-table predictor with specific construction rules\n\n It is the same as `.train_predictor_mt` but with the specification of the allowed\n variable construction rules. The list of available rules is found in the field\n ``kh.ALL_CONSTRUCTION_RULES``\n \n"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion khiops/samples/samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ def train_predictor_mt_with_specific_rules():

It is the same as `.train_predictor_mt` but with the specification of the allowed
variable construction rules. The list of available rules is found in the field
``kh.all_construction_rules``
``kh.ALL_CONSTRUCTION_RULES``
"""
# Imports
import os
Expand Down
32 changes: 15 additions & 17 deletions khiops/samples/samples_sklearn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,19 @@
"khc.fit(X_train, y_train)\n",
"\n",
"# Show the feature importance info\n",
"print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n",
"print(f\"Features selected : {khc.n_features_used_}\")\n",
"print(\"Top 3 used features\")\n",
"for i, feature in enumerate(khc.feature_used_names_[:3]):\n",
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
"print(\"Top 5 used features\")\n",
"for i, feature in enumerate(khc.feature_used_names_[:5]):\n",
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i]}\")\n",
"print(\"---\")\n",
"\n",
"print(\"Top 5 used features, among those present in the dataset\")\n",
"for feature, importance in sorted(\n",
" zip(khc.feature_names_in_, khc.feature_importances_),\n",
" key=lambda imp: imp[1],\n",
" reverse=True,\n",
")[:5]:\n",
" print(f\"{feature} - Importance: {importance}\")\n",
"print(\"---\")\n",
"\n",
"# Predict the classes on the test dataset\n",
Expand Down Expand Up @@ -199,11 +207,10 @@
"khc.fit(X_train, y_train)\n",
"\n",
"# Show the feature importance info\n",
"print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n",
"print(f\"Features selected : {khc.n_features_used_}\")\n",
"print(\"Top 3 used features\")\n",
"for i, feature in enumerate(khc.feature_used_names_[:3]):\n",
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i]}\")\n",
"print(\"---\")\n",
"\n",
"# Predict the classes on the test dataset\n",
Expand Down Expand Up @@ -346,11 +353,10 @@
"khc.fit(X_train, y_train)\n",
"\n",
"# Show the feature importance info\n",
"print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n",
"print(f\"Features selected : {khc.n_features_used_}\")\n",
"print(\"Top 3 used features\")\n",
"for i, feature in enumerate(khc.feature_used_names_[:3]):\n",
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
" print(f\"{feature} - Importance: {khc.feature_used_importances_[i]}\")\n",
"print(\"---\")\n",
"\n",
"# Predict the class on the test dataset\n",
Expand Down Expand Up @@ -630,11 +636,10 @@
"khr.fit(X_train, y_train)\n",
"\n",
"# Show the feature importance info\n",
"print(f\"Features evaluated: {khr.n_features_evaluated_}\")\n",
"print(f\"Features selected : {khr.n_features_used_}\")\n",
"print(\"Top 3 used features\")\n",
"for i, feature in enumerate(khr.feature_used_names_[:3]):\n",
" print(f\"{feature} - Importance: {khr.feature_used_importances_[i][2]}\")\n",
" print(f\"{feature} - Importance: {khr.feature_used_importances_[i]}\")\n",
"print(\"---\")\n",
"\n",
"# Predict the values on the test dataset\n",
Expand Down Expand Up @@ -796,13 +801,6 @@
"khe = KhiopsEncoder(n_features=10)\n",
"khe.fit(X, y)\n",
"\n",
"# Show the feature importance info\n",
"print(f\"Features evaluated: {khe.n_features_evaluated_}\")\n",
"print(\"Top 3 evaluated features\")\n",
"for i, feature in enumerate(khe.feature_evaluated_names_[:3]):\n",
" print(f\"{feature} - Level: {khe.feature_evaluated_importances_[i]}\")\n",
"print(\"---\")\n",
"\n",
"# Transform the train dataset\n",
"print(\"Encoded feature names:\")\n",
"print(khe.feature_names_out_)\n",
Expand Down
32 changes: 15 additions & 17 deletions khiops/samples/samples_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,19 @@ def khiops_classifier():
khc.fit(X_train, y_train)

# Show the feature importance info
print(f"Features evaluated: {khc.n_features_evaluated_}")
print(f"Features selected : {khc.n_features_used_}")
print("Top 3 used features")
for i, feature in enumerate(khc.feature_used_names_[:3]):
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
print("Top 5 used features")
for i, feature in enumerate(khc.feature_used_names_[:5]):
print(f"{feature} - Importance: {khc.feature_used_importances_[i]}")
print("---")

print("Top 5 used features, among those present in the dataset")
for feature, importance in sorted(
zip(khc.feature_names_in_, khc.feature_importances_),
key=lambda imp: imp[1],
reverse=True,
)[:5]:
print(f"{feature} - Importance: {importance}")
print("---")

# Predict the classes on the test dataset
Expand Down Expand Up @@ -182,11 +190,10 @@ def khiops_classifier_text():
khc.fit(X_train, y_train)

# Show the feature importance info
print(f"Features evaluated: {khc.n_features_evaluated_}")
print(f"Features selected : {khc.n_features_used_}")
print("Top 3 used features")
for i, feature in enumerate(khc.feature_used_names_[:3]):
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
print(f"{feature} - Importance: {khc.feature_used_importances_[i]}")
print("---")

# Predict the classes on the test dataset
Expand Down Expand Up @@ -311,11 +318,10 @@ def khiops_classifier_multitable_snowflake():
khc.fit(X_train, y_train)

# Show the feature importance info
print(f"Features evaluated: {khc.n_features_evaluated_}")
print(f"Features selected : {khc.n_features_used_}")
print("Top 3 used features")
for i, feature in enumerate(khc.feature_used_names_[:3]):
print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
print(f"{feature} - Importance: {khc.feature_used_importances_[i]}")
print("---")

# Predict the class on the test dataset
Expand Down Expand Up @@ -551,11 +557,10 @@ def khiops_regressor():
khr.fit(X_train, y_train)

# Show the feature importance info
print(f"Features evaluated: {khr.n_features_evaluated_}")
print(f"Features selected : {khr.n_features_used_}")
print("Top 3 used features")
for i, feature in enumerate(khr.feature_used_names_[:3]):
print(f"{feature} - Importance: {khr.feature_used_importances_[i][2]}")
print(f"{feature} - Importance: {khr.feature_used_importances_[i]}")
print("---")

# Predict the values on the test dataset
Expand Down Expand Up @@ -696,13 +701,6 @@ def khiops_encoder_multitable_snowflake():
khe = KhiopsEncoder(n_features=10)
khe.fit(X, y)

# Show the feature importance info
print(f"Features evaluated: {khe.n_features_evaluated_}")
print("Top 3 evaluated features")
for i, feature in enumerate(khe.feature_evaluated_names_[:3]):
print(f"{feature} - Level: {khe.feature_evaluated_importances_[i]}")
print("---")

# Transform the train dataset
print("Encoded feature names:")
print(khe.feature_names_out_)
Expand Down
Loading
Loading