:orphan:
.. currentmodule:: samples
The code snippets on this page demonstrate the basic use of the :py:mod:`khiops.core` module.
The samples in this page are also available as:
- :download:`Python script <../../khiops/samples/samples.py>`
- :download:`Jupyter notebook <../../khiops/samples/samples.ipynb>`
First make sure you have installed the sample datasets. In a configured conda shell (ex. Anaconda Prompt in Windows) execute:

    kh-download-datasets

If that doesn't work open a python console and execute:
    from khiops.tools import download_datasets
    download_datasets()

.. autofunction:: get_khiops_version

    print(f"Khiops version: {kh.get_khiops_version()}")

.. autofunction:: build_dictionary_from_data_table
# Sample build_dictionary_from_data_table: infer a dictionary file (.kdic)
# automatically from the contents of a tabular data file.
# Imports
import os
from khiops import core as kh
# Set the file paths
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
dictionary_name = "AutoAdult"
dictionary_file_path = os.path.join(
    "kh_samples", "build_dictionary_from_data_table", "AutoAdult.kdic"
)
# Create the dictionary from the data table
kh.build_dictionary_from_data_table(
    data_table_path, dictionary_name, dictionary_file_path
).. autofunction:: create_dictionary_domain
# Imports
import os
from khiops import core as kh
# Create a Root dictionary
root_dictionary = kh.Dictionary(
json_data={"name": "dict_from_scratch", "root": True, "key": ["Id"]}
)
# Start with simple variables to declare
simple_variables = [
{"name": "Id", "type": "Categorical"},
{"name": "Num", "type": "Numerical"},
{"name": "text", "type": "Text"},
{"name": "hour", "type": "Time"},
{"name": "date", "type": "Date"},
{"name": "ambiguous_ts", "type": "Timestamp"},
{"name": "ts", "type": "TimestampTZ"},
]
for var_spec in simple_variables:
var = kh.Variable()
var.name = var_spec["name"]
var.type = var_spec["type"]
root_dictionary.add_variable(var)
# Create a second dictionary
second_dictionary = kh.Dictionary(
json_data={"name": "Service", "key": ["Id", "id_product"]}
)
second_dictionary.add_variable(
kh.Variable(json_data={"name": "Id", "type": "Categorical"})
)
second_dictionary.add_variable(
kh.Variable(json_data={"name": "id_product", "type": "Categorical"})
)
# Create a third dictionary
third_dictionary = kh.Dictionary(json_data={"name": "Address", "key": ["Id"]})
third_dictionary.add_variable(
kh.Variable(json_data={"name": "StreetNumber", "type": "Numerical"})
)
third_dictionary.add_variable(
kh.Variable(json_data={"name": "StreetName", "type": "Categorical"})
)
third_dictionary.add_variable(
kh.Variable(json_data={"name": "id_city", "type": "Categorical"})
)
# Add the variables used in a multi-table context in the first dictionary.
# They link the root dictionary to the additional ones
root_dictionary.add_variable(
kh.Variable(json_data={"name": "Services", "type": "Table(Service)"})
)
root_dictionary.add_variable(
kh.Variable(json_data={"name": "Address", "type": "Entity(Address)"})
)
# Create a DictionaryDomain (set of dictionaries)
dictionary_domain = kh.DictionaryDomain()
dictionary_domain.add_dictionary(root_dictionary)
dictionary_domain.add_dictionary(second_dictionary)
dictionary_domain.add_dictionary(third_dictionary)
output_dir = os.path.join("kh_samples", "create_dictionary_domain")
dictionary_file_path = os.path.join(output_dir, "dict_from_scratch.kdic")
# Create the output directory if needed
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
# Write the dictionary domain to a file
dictionary_domain.export_khiops_dictionary_file(dictionary_file_path).. autofunction:: detect_data_table_format
# Imports
import os
from khiops import core as kh
# Set the file paths
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
output_dir = os.path.join("kh_samples", "detect_data_table_format")
transformed_data_table_path = os.path.join(output_dir, "AdultWithAnotherFormat.txt")
# Create the output directory
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
# Detect the format of the table
format_spec = kh.detect_data_table_format(data_table_path)
print("Format specification (header_line, field_separator)")
print("Format detected on original table:", format_spec)
# Make a deployment to change the format of the data table
kh.deploy_model(
dictionary_file_path,
"Adult",
data_table_path,
transformed_data_table_path,
output_header_line=False,
output_field_separator=",",
)
# Detect the new format of the table without a dictionary file
format_spec = kh.detect_data_table_format(transformed_data_table_path)
print("Format detected on reformatted table:", format_spec)
# Detect the new format of the table with a dictionary file
format_spec = kh.detect_data_table_format(
transformed_data_table_path,
dictionary_file_path_or_domain=dictionary_file_path,
dictionary_name="Adult",
)
print("Format detected (with dictionary file) on reformatted table:", format_spec).. autofunction:: check_database
# Sample check_database: validate a data table against its dictionary,
# writing at most 50 messages to a log file.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
log_file = os.path.join("kh_samples", "check_database", "check_database.log")
# Check the database
kh.check_database(
    dictionary_file_path,
    "Adult",
    data_table_path,
    log_file_path=log_file,
    max_messages=50,
).. autofunction:: export_dictionary_files
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
output_dir = os.path.join("kh_samples", "export_dictionary_files")
output_dictionary_file_path = os.path.join(output_dir, "ModifiedAdult.kdic")
output_dictionary_json_path = os.path.join(output_dir, "ModifiedAdult.kdicj")
alt_output_dictionary_json_path = os.path.join(output_dir, "AltModifiedAdult.kdicj")
# Load the dictionary domain from initial dictionary file
# Then obtain the "Adult" dictionary within
domain = kh.read_dictionary_file(dictionary_file_path)
dictionary = domain.get_dictionary("Adult")
# Set some of its variables to unused
fnlwgt_variable = dictionary.get_variable("fnlwgt")
fnlwgt_variable.used = False
label_variable = dictionary.get_variable("Label")
label_variable.used = False
# Create output directory if necessary
if not os.path.exists("kh_samples"):
os.mkdir("kh_samples")
os.mkdir(output_dir)
else:
if not os.path.exists(output_dir):
os.mkdir(output_dir)
# Export to kdic
domain.export_khiops_dictionary_file(output_dictionary_file_path)
# Export to kdicj either from the domain or from a kdic file
# Requires a Khiops execution, that's why it is not a method of DictionaryDomain
kh.export_dictionary_as_json(domain, output_dictionary_json_path)
kh.export_dictionary_as_json(
output_dictionary_file_path, alt_output_dictionary_json_path
).. autofunction:: train_predictor
# Sample train_predictor: train an SNB classifier on the Adult dataset
# (target "class", decision trees disabled).
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
analysis_report_file_path = os.path.join(
    "kh_samples", "train_predictor", "AnalysisReport.khj"
)
# Train the predictor
kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    analysis_report_file_path,
    max_trees=0,
).. autofunction:: train_predictor_file_paths
# Sample train_predictor_file_paths: train a predictor and use the returned
# pair (report path, modeling dictionary path).
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
report_file_path = os.path.join(
    "kh_samples", "train_predictor_file_paths", "AnalysisResults.khj"
)
# Train the predictor
# The first element of the returned tuple (the report path) is discarded
_, modeling_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    max_trees=0,
)
print("Reports file available at " + report_file_path)
print("Modeling dictionary file available at " + modeling_dictionary_file_path)
# If you have Khiops Visualization installed you may open the report as follows
# kh.visualize_report(report_file_path).. autofunction:: train_predictor_text
# Sample train_predictor_text: train a classifier on a text dataset using
# word-based text features (at most 1000) and up to 5 trees.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(
    kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic"
)
data_table_path = os.path.join(
    kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
)
report_file_path = os.path.join(
    "kh_samples", "train_predictor_text", "AnalysisResults.khj"
)
# Train the predictor
kh.train_predictor(
    dictionary_file_path,
    "FlightNegativeTweets",
    data_table_path,
    "negativereason",
    report_file_path,
    max_trees=5,
    max_text_features=1000,
    text_features="words",
).. autofunction:: train_predictor_error_handling
# Sample train_predictor_error_handling: provoke a KhiopsRuntimeError with a
# nonexistent dictionary file, then dump the log and the executed scenario.
# Imports
import os
from khiops import core as kh
# Set the file paths with a nonexistent dictionary file
dictionary_file_path = "NONEXISTENT_DICTIONARY_FILE.kdic"
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "train_predictor_error_handling")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
log_file_path = os.path.join(output_dir, "khiops.log")
scenario_path = os.path.join(output_dir, "scenario._kh")
# Train the predictor and handle the error
try:
    kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        report_file_path,
        trace=True,
        log_file_path=log_file_path,
        output_scenario_path=scenario_path,
    )
except kh.KhiopsRuntimeError as error:
    print("Khiops training failed! Below the KhiopsRuntimeError message:")
    print(error)
    # The log and scenario files were written despite the failure (trace=True)
    print("\nFull log contents:")
    print("------------------")
    with open(log_file_path) as log_file:
        for line in log_file:
            print(line, end="")
    print("\nExecuted scenario")
    print("-----------------")
    with open(scenario_path) as scenario_file:
        for line in scenario_file:
            print(line, end="").. autofunction:: train_predictor_mt
# Sample train_predictor_mt: multi-table training with one secondary table
# (Vehicles) linked via additional_data_tables.
# Imports
import os
from khiops import core as kh
# Set the file paths
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
report_file_path = os.path.join(
    "kh_samples", "train_predictor_mt", "AnalysisResults.khj"
)
# Train the predictor. Besides the mandatory parameters, we specify:
# - A python dictionary linking data paths to file paths for non-root tables
# - To not construct any decision tree
# The default number of automatic features is 100
kh.train_predictor(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
    max_trees=0,
).. autofunction:: train_predictor_mt_with_specific_rules
# Sample train_predictor_mt_with_specific_rules: multi-table training with an
# explicit variable-construction budget and a restricted set of rules.
# Imports
import os
from khiops import core as kh
# Set the file paths
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
report_file_path = os.path.join(
    "kh_samples",
    "train_predictor_mt_with_specific_rules",
    "AnalysisResults.khj",
)
# Train the predictor. Besides the mandatory parameters, it is specified:
# - A python dictionary linking data paths to file paths for non-root tables
# - The maximum number of aggregate variables to construct (1000)
# - The construction rules allowed to automatically create aggregates
# - To not construct any decision tree
kh.train_predictor(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
    max_constructed_variables=1000,
    construction_rules=["TableMode", "TableSelection"],
    max_trees=0,
).. autofunction:: train_predictor_mt_snowflake
# Sample train_predictor_mt_snowflake: multi-table training on a snowflake
# schema; nested tables use slash-separated data paths (e.g. "Vehicles/Users").
# Imports
import os
from khiops import core as kh
# Set the file paths
accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
users_table_path = os.path.join(accidents_dir, "Users.txt")
places_table_path = os.path.join(accidents_dir, "Places.txt")
report_file_path = os.path.join(
    "kh_samples", "train_predictor_mt_snowflake", "AnalysisResults.khj"
)
# Train the predictor. Besides the mandatory parameters, we specify:
# - A python dictionary linking data paths to file paths for non-root tables
# - To not construct any decision tree
# The default number of automatic features is 100
kh.train_predictor(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables={
        "Vehicles": vehicles_table_path,
        "Vehicles/Users": users_table_path,
        "Place": places_table_path,
    },
    max_trees=0,
).. autofunction:: train_predictor_with_train_percentage
# Sample train_predictor_with_train_percentage: 90%/10% train/test split via
# sample_percentage and use_complement_as_test.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
report_file_path = os.path.join(
    "kh_samples",
    "train_predictor_with_train_percentage",
    "P90_AnalysisResults.khj",
)
# Train the predictor. Besides the mandatory parameters, it is specified:
# - A 90% sampling rate for the training dataset
# - Set the test dataset as the complement of the training dataset (10%)
# - No trees
kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    sample_percentage=90,
    use_complement_as_test=True,
    max_trees=0,
).. autofunction:: train_predictor_with_trees
# Sample train_predictor_with_trees: train on Letter with up to 15 decision
# trees and an 80%/20% train/test split.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Letter", "Letter.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Letter", "Letter.txt")
report_file_path = os.path.join(
    "kh_samples", "train_predictor_with_trees", "P80_AnalysisResults.khj"
)
# Train the predictor with at most 15 trees (default 10)
kh.train_predictor(
    dictionary_file_path,
    "Letter",
    data_table_path,
    "lettr",
    report_file_path,
    sample_percentage=80,
    use_complement_as_test=True,
    max_trees=15,
).. autofunction:: train_predictor_with_pairs
# Sample train_predictor_with_pairs: train with variable pairs; an empty
# string in a pair spec means "all pairs containing the other variable".
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
report_file_path = os.path.join(
    "kh_samples", "train_predictor_with_pairs", "AnalysisResults.khj"
)
# Train the predictor with at most 10 pairs as follows:
# - Include pairs age-race and capital_gain-capital_loss
# - Include all possible pairs having relationship as component
kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    use_complement_as_test=True,
    max_trees=0,
    max_pairs=10,
    specific_pairs=[
        ("age", "race"),
        ("capital_gain", "capital_loss"),
        ("relationship", ""),
    ],
).. autofunction:: train_predictor_with_multiple_parameters
# Sample train_predictor_with_multiple_parameters: exercise the generic
# runner options (scenario output, log file, memory limit, trace).
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "train_predictor_with_multiple_parameters")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_script_path = os.path.join(output_dir, "output_scenario._kh")
log_path = os.path.join(output_dir, "log.txt")
# Train the predictor. Besides the mandatory parameters, we specify:
# - The value "more" as main target value
# - The output Khiops script file location (generic)
# - The log file location (generic)
# - The maximum memory used, set to 1000 MB
# - To show the debug trace (generic)
kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    main_target_value="more",
    output_scenario_path=output_script_path,
    log_file_path=log_path,
    memory_limit_mb=1000,
    trace=True,
).. autofunction:: train_predictor_detect_format
# Sample train_predictor_detect_format: show that a wrong explicit format
# spec fails, while automatic format detection (the default) succeeds.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt")
output_dir = os.path.join("kh_samples", "train_predictor_detect_format")
transformed_data_table_path = os.path.join(output_dir, "TransformedIris.txt")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
# Transform the database format from header_line=True and field_separator=TAB
# to header_line=False and field_separator=","
# See the deploy_model examples below for more details
kh.deploy_model(
    dictionary_file_path,
    "Iris",
    data_table_path,
    transformed_data_table_path,
    output_header_line=False,
    output_field_separator=",",
)
# Try to learn with the old format
try:
    kh.train_predictor(
        dictionary_file_path,
        "Iris",
        transformed_data_table_path,
        "Class",
        report_file_path,
        header_line=True,
        field_separator="",
    )
except kh.KhiopsRuntimeError as error:
    print(
        "This failed because of a bad data table format spec. "
        + "Below the KhiopsRuntimeError message"
    )
    print(error)
# Train without specifying the format (detect_format is True by default)
kh.train_predictor(
    dictionary_file_path,
    "Iris",
    transformed_data_table_path,
    "Class",
    report_file_path,
).. autofunction:: train_predictor_with_cross_validation
# Imports
import math
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "train_predictor_with_cross_validation")
fold_dictionary_file_path = os.path.join(output_dir, "AdultWithFolding.kdic")
# Create the output directory
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
# Load the learning dictionary object
domain = kh.read_dictionary_file(dictionary_file_path)
dictionary = domain.get_dictionary("Adult")
# Add a random fold index variable to the learning dictionary
fold_number = 5
fold_index_variable = kh.Variable()
fold_index_variable.name = "FoldIndex"
fold_index_variable.type = "Numerical"
fold_index_variable.used = False
dictionary.add_variable(fold_index_variable)
# Create fold indexing rule and set it on `fold_index_variable`
dictionary.get_variable(fold_index_variable.name).set_rule(
kh.Rule("Ceil", kh.Rule("Product", fold_number, kh.Rule("Random()"))),
)
# Add variables that indicate if the instance is in the train dataset:
for fold_index in range(1, fold_number + 1):
is_in_train_dataset_variable = kh.Variable()
is_in_train_dataset_variable.name = "IsInTrainDataset" + str(fold_index)
is_in_train_dataset_variable.type = "Numerical"
is_in_train_dataset_variable.used = False
dictionary.add_variable(is_in_train_dataset_variable)
dictionary.get_variable(is_in_train_dataset_variable.name).set_rule(
kh.Rule("NEQ", fold_index_variable, fold_index),
)
# Print dictionary with fold variables
print("Dictionary file with fold variables")
domain.export_khiops_dictionary_file(fold_dictionary_file_path)
with open(fold_dictionary_file_path) as fold_dictionary_file:
for line in fold_dictionary_file:
print(line, end="")
# For each fold k:
print("Training Adult with " + str(fold_number) + " folds")
print("\tfold\ttrain auc\ttest auc")
train_aucs = []
test_aucs = []
for fold_index in range(1, fold_number + 1):
analysis_report_file_path = os.path.join(
output_dir, "Fold" + str(fold_index) + "AnalysisResults.khj"
)
# Train a model from the sub-dataset where IsInTrainDataset<k> is 1
_, modeling_dictionary_file_path = kh.train_predictor(
domain,
"Adult",
data_table_path,
"class",
analysis_report_file_path,
sample_percentage=100,
selection_variable="IsInTrainDataset" + str(fold_index),
selection_value=1,
max_trees=0,
)
evaluation_report_file_path = os.path.join(
output_dir, "Fold" + str(fold_index) + "AdultEvaluationResults.khj"
)
# Evaluate the resulting model in the subsets where IsInTrainDataset is 0
test_evaluation_report_path = kh.evaluate_predictor(
modeling_dictionary_file_path,
"SNB_Adult",
data_table_path,
evaluation_report_file_path,
sample_percentage=100,
selection_variable="IsInTrainDataset" + str(fold_index),
selection_value=0,
)
# Obtain the train AUC from the train report and the test AUC from the
# evaluation report and print them
train_results = kh.read_analysis_results_file(analysis_report_file_path)
test_evaluation_results = kh.read_analysis_results_file(test_evaluation_report_path)
train_auc = train_results.train_evaluation_report.get_snb_performance().auc
test_auc = test_evaluation_results.evaluation_report.get_snb_performance().auc
print("\t" + str(fold_index) + "\t" + str(train_auc) + "\t" + str(test_auc))
# Store the train and test AUCs in arrays
train_aucs.append(train_auc)
test_aucs.append(test_auc)
# Print the mean +- error aucs for both train and test
mean_train_auc = sum(train_aucs) / fold_number
squared_error_train_aucs = [(auc - mean_train_auc) ** 2 for auc in train_aucs]
sd_train_auc = math.sqrt(sum(squared_error_train_aucs) / (fold_number - 1))
mean_test_auc = sum(test_aucs) / fold_number
squared_error_test_aucs = [(auc - mean_test_auc) ** 2 for auc in test_aucs]
sd_test_auc = math.sqrt(sum(squared_error_test_aucs) / (fold_number - 1))
print("final auc")
print("train auc: " + str(mean_train_auc) + " +- " + str(sd_train_auc))
print("test auc: " + str(mean_test_auc) + " +- " + str(sd_test_auc)).. autofunction:: interpret_predictor
# Sample interpret_predictor: train an SNB model then build an
# interpretation dictionary for it.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "interpret_predictor")
analysis_report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
interpretor_file_path = os.path.join(output_dir, "InterpretationModel.kdic")
# Build prediction model
# NOTE(review): assumes kh.train_predictor creates output_dir — confirm
_, predictor_file_path = kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    analysis_report_file_path,
)
# Build interpretation model
kh.interpret_predictor(predictor_file_path, "SNB_Adult", interpretor_file_path)
print(f"The interpretation model is '{interpretor_file_path}'").. autofunction:: reinforce_predictor
# Sample reinforce_predictor: train an SNB model then build a reinforced
# predictor using "occupation" as lever variable.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "reinforce_predictor")
analysis_report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
reinforced_predictor_file_path = os.path.join(output_dir, "ReinforcedAdultModel.kdic")
# Build prediction model
_, predictor_file_path = kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    analysis_report_file_path,
)
# Build reinforced predictor
kh.reinforce_predictor(
    predictor_file_path,
    "SNB_Adult",
    reinforced_predictor_file_path,
    reinforcement_lever_variables=["occupation"],
)
print(f"The reinforced predictor is '{reinforced_predictor_file_path}'").. autofunction:: multiple_train_predictor
# Sample multiple_train_predictor: train a sequence of SNB models, disabling
# variables one by one in increasing level (predictive power) order.
# Imports
import os
from khiops import core as kh


def display_test_results(json_result_file_path):
    """Display some of the training results

    Prints variable count, train AUC and test AUC of the SNB model found in
    the given analysis report file.
    """
    results = kh.read_analysis_results_file(json_result_file_path)
    train_performance = results.train_evaluation_report.get_snb_performance()
    test_performance = results.test_evaluation_report.get_snb_performance()
    print(
        "\t"
        + str(len(results.preparation_report.variables_statistics))
        + "\t"
        + str(train_performance.auc)
        + "\t"
        + str(test_performance.auc)
    )


# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "multiple_train_predictor")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
# Read the dictionary file to obtain an instance of class Dictionary
dictionary_domain = kh.read_dictionary_file(dictionary_file_path)
dictionary = dictionary_domain.get_dictionary("Adult")
# Train a SNB model using all the variables
print("\t#vars\ttrain auc\ttest auc")
kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    sample_percentage=70,
    use_complement_as_test=True,
    max_trees=0,
)
display_test_results(report_file_path)
# Read results to obtain the variables sorted by decreasing Level
analysis_results = kh.read_analysis_results_file(report_file_path)
preparation_results = analysis_results.preparation_report
# Train a sequence of models with a decreasing number of variables
# We disable variables one-by-one in increasing level (predictive power) order
variable_number = len(preparation_results.variables_statistics)
for i in reversed(range(variable_number)):
    # Search the next variable
    variable = preparation_results.variables_statistics[i]
    # Disable this variable and save the dictionary with the Khiops format
    dictionary.get_variable(variable.name).used = False
    # Train the model with this dictionary domain object
    report_file_path = os.path.join(
        output_dir, f"V{variable_number - 1 - i}_AnalysisResults.khj"
    )
    kh.train_predictor(
        dictionary_domain,
        "Adult",
        data_table_path,
        "class",
        report_file_path,
        sample_percentage=70,
        use_complement_as_test=True,
        max_trees=0,
    )
    # Show a preview of the results
    display_test_results(report_file_path).. autofunction:: evaluate_predictor
# Sample evaluate_predictor: train an SNB model, then evaluate it on a data
# table and write an evaluation report.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "evaluate_predictor")
analysis_report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
# Train the predictor
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    analysis_report_file_path,
    max_trees=0,
)
evaluation_report_file_path = os.path.join(output_dir, "AdultEvaluationResults.khj")
# Evaluate the predictor
kh.evaluate_predictor(
    model_dictionary_file_path,
    "SNB_Adult",
    data_table_path,
    evaluation_report_file_path,
)
print("Evaluation report available at " + evaluation_report_file_path).. autofunction:: access_predictor_evaluation_report
# Sample access_predictor_evaluation_report: train an SNB model and read its
# test evaluation report (metrics, confusion matrix, lift curves).
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
report_file_path = os.path.join(
    "kh_samples", "access_predictor_evaluation_report", "AdultAnalysisReport.khj"
)
# Train the SNB predictor and some univariate predictors
# Note: Evaluation in test is 30% by default
kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    max_trees=0,
)
# Obtain the evaluation results
results = kh.read_analysis_results_file(report_file_path)
evaluation_report = results.test_evaluation_report
snb_performance = evaluation_report.get_snb_performance()
# Print univariate metrics for the SNB
print("\nperformance metrics for " + snb_performance.name)
for metric_name in snb_performance.get_metric_names():
    print(metric_name + ": " + str(snb_performance.get_metric(metric_name)))
# Print the confusion matrix: header row of target values, then one row of
# observed frequencies per target value
print("\nconfusion matrix:")
confusion_matrix = snb_performance.confusion_matrix
for target_value in confusion_matrix.values:
    print("\t" + target_value, end="")
print("")
for i, target_value in enumerate(confusion_matrix.values):
    observed_frequencies = confusion_matrix.matrix[i]
    print(target_value, end="")
    for frequency in observed_frequencies:
        print("\t" + str(frequency), end="")
    print("")
# Print the head of the lift curves for the 'more' modality
print("\nfirst five values of the lift curves for 'more'")
snb_lift_curve = evaluation_report.get_snb_lift_curve("more")
optimal_lift_curve = evaluation_report.get_classifier_lift_curve("Optimal", "more")
random_lift_curve = evaluation_report.get_classifier_lift_curve("Random", "more")
for i in range(5):
    print(
        str(snb_lift_curve.values[i])
        + "\t"
        + str(optimal_lift_curve.values[i])
        + "\t"
        + str(random_lift_curve.values[i])
    )
# Print metrics for an SNB predictor, accessed by its full name this time
predictor_performance = evaluation_report.get_predictor_performance(
    "Selective Naive Bayes"
)
print("\n\nperformance metrics for " + predictor_performance.name)
for metric_name in predictor_performance.get_metric_names():
    print(metric_name + ": " + str(predictor_performance.get_metric(metric_name))).. autofunction:: train_recoder
# Sample train_recoder: train a recoder model on Adult with default settings.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
report_file_path = os.path.join("kh_samples", "train_recoder", "AnalysisResults.khj")
# Train the recoder model
kh.train_recoder(
    dictionary_file_path, "Adult", data_table_path, "class", report_file_path
).. autofunction:: train_recoder_with_multiple_parameters
# Sample train_recoder_with_multiple_parameters: recoder with pair features
# and "part label" recoding for both categorical and numerical variables.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
report_file_path = os.path.join(
    "kh_samples",
    "train_recoder_with_multiple_parameters",
    "AnalysisResults.khj",
)
# Train the recoder model
kh.train_recoder(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    max_pairs=10,
    categorical_recoding_method="part label",
    numerical_recoding_method="part label",
).. autofunction:: train_recoder_mt_flatten
# Sample train_recoder_mt_flatten: multi-table recoder that keeps all
# constructed variables without recoding their values (flatten use case).
# Imports
import os
from khiops import core as kh
# Set the file paths
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
report_file_path = os.path.join(
    "kh_samples", "train_recoder_mt_flatten", "AnalysisResults.khj"
)
# Train the recoder. Besides the mandatory parameters, it is specified:
# - A python dictionary linking data paths to file paths for non-root tables
# - The maximum number of aggregate variables to construct (1000)
# - To keep all the created variables independently of their informativeness (level)
# - To not recode the variables values
kh.train_recoder(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
    max_constructed_variables=1000,
    informative_variables_only=False,
    categorical_recoding_method="none",
    numerical_recoding_method="none",
    keep_initial_categorical_variables=True,
    keep_initial_numerical_variables=True,
).. autofunction:: deploy_model
# Sample deploy_model: train an SNB model, then deploy it to score the same
# data table.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "deploy_model")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "ScoresAdult.txt")
# Train the predictor
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    max_trees=0,
)
# Deploy the model on the database
# It will score it according to the trained predictor
kh.deploy_model(
    model_dictionary_file_path, "SNB_Adult", data_table_path, output_data_table_path
).. autofunction:: deploy_model_text
# Sample deploy_model_text: train a text classifier, then deploy it to score
# the tweets table.
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(
    kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic"
)
data_table_path = os.path.join(
    kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
)
output_dir = os.path.join("kh_samples", "deploy_model_text")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "ScoresNegativeAirlineTweets.txt")
# Train the predictor
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "FlightNegativeTweets",
    data_table_path,
    "negativereason",
    report_file_path,
    max_trees=5,
    max_text_features=1000,
    text_features="words",
)
# Deploy the model on the database
# It will score it according to the trained predictor
kh.deploy_model(
    model_dictionary_file_path,
    "SNB_FlightNegativeTweets",
    data_table_path,
    output_data_table_path,
).. autofunction:: deploy_model_mt
# Imports
import os
from khiops import core as kh
# Set the file paths
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
output_dir = os.path.join("kh_samples", "deploy_model_mt")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "TransferredAccidents.txt")
# Train the predictor (see train_predictor_mt for details)
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
    max_trees=0,
)
# Deploy the model on the database
# Besides the mandatory parameters, it is specified:
# - A python dictionary linking data paths to file paths for non-root tables
#   (the secondary "Vehicles" table must also be provided at deployment time)
kh.deploy_model(
    model_dictionary_file_path,
    "SNB_Accident",
    accidents_table_path,
    output_data_table_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
).. autofunction:: deploy_model_mt_with_interpretation
# Imports
import os
from khiops import core as kh
# Set the file paths
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
output_dir = os.path.join("kh_samples", "deploy_model_mt_with_interpretation")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
interpretor_file_path = os.path.join(output_dir, "InterpretationModel.kdic")
output_data_table_path = os.path.join(output_dir, "InterpretedAccidents.txt")
# Train the predictor (see train_predictor_mt for details)
# Add max_evaluated_variables so that an interpretation model can be built
# (see https://github.com/KhiopsML/khiops/issues/577)
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
    max_trees=0,
    max_evaluated_variables=10,
)
# Interpret the predictor: build an interpretation dictionary that computes,
# per record, at most 3 variable importances with the "Individual" ranking
kh.interpret_predictor(
    model_dictionary_file_path,
    "SNB_Accident",
    interpretor_file_path,
    max_variable_importances=3,
    importance_ranking="Individual",
)
# Deploy the interpretation model on the database
# Besides the mandatory parameters, it is specified:
# - A python dictionary linking data paths to file paths for non-root tables
kh.deploy_model(
    interpretor_file_path,
    "Interpretation_SNB_Accident",
    accidents_table_path,
    output_data_table_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
).. autofunction:: deploy_reinforced_model_mt
# Imports
import os
from khiops import core as kh
# Set the file paths
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
output_dir = os.path.join("kh_samples", "deploy_reinforced_model_mt")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
reinforced_predictor_file_path = os.path.join(output_dir, "ReinforcedModel.kdic")
output_data_table_path = os.path.join(output_dir, "ReinforcedAccidents.txt")
# Train the predictor (see train_predictor_mt for details)
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
    max_trees=0,
)
# Reinforce the predictor: build a reinforcement dictionary targeting the
# "NonLethal" value, acting on the two specified lever variables
kh.reinforce_predictor(
    model_dictionary_file_path,
    "SNB_Accident",
    reinforced_predictor_file_path,
    reinforcement_target_value="NonLethal",
    reinforcement_lever_variables=["InAgglomeration", "CollisionType"],
)
# Deploy the reinforced model on the database
# Besides the mandatory parameters, it is specified:
# - A python dictionary linking data paths to file paths for non-root tables
kh.deploy_model(
    reinforced_predictor_file_path,
    "Reinforcement_SNB_Accident",
    accidents_table_path,
    output_data_table_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
).. autofunction:: deploy_model_mt_snowflake
# Imports
import os
from khiops import core as kh
# Set the file paths
accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
users_table_path = os.path.join(accidents_dir, "Users.txt")
places_table_path = os.path.join(accidents_dir, "Places.txt")
output_dir = os.path.join("kh_samples", "deploy_model_mt_snowflake")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "TransferredAccidents.txt")
# Train the predictor. Besides the mandatory parameters, we specify:
# - A python dictionary linking data paths to file paths for non-root tables
#   (the slash in "Vehicles/Users" denotes a table nested under "Vehicles")
# - To not construct any decision tree
# The default number of automatic features is 100
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables={
        "Vehicles": vehicles_table_path,
        "Vehicles/Users": users_table_path,
        "Place": places_table_path,
    },
    max_trees=0,
)
# Deploy the model on the database
# Besides the mandatory parameters, it is specified:
# - A python dictionary linking data paths to file paths for non-root tables
kh.deploy_model(
    model_dictionary_file_path,
    "SNB_Accident",
    accidents_table_path,
    output_data_table_path,
    additional_data_tables={
        "Vehicles": vehicles_table_path,
        "Vehicles/Users": users_table_path,
        "Place": places_table_path,
    },
).. autofunction:: deploy_model_expert
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "deploy_model_expert")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "ScoresAdult.txt")
# Train the predictor
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    max_trees=0,
)
# Read the dictionary file to obtain an instance of class Dictionary
model_domain = kh.read_dictionary_file(model_dictionary_file_path)
snb_dictionary = model_domain.get_dictionary("SNB_Adult")
# Select Label (identifier)
snb_dictionary.get_variable("Label").used = True
# Select the variables containing the probabilities for each class
for variable in snb_dictionary.variables:
    # The variable must have a meta-data key that starts with "TargetProb"
    for key in variable.meta_data.keys:
        if key.startswith("TargetProb"):
            variable.used = True
# Deploy the model. Besides the mandatory parameters, it is specified:
# - A DictionaryDomain object to use instead of the mandatory dictionary file
kh.deploy_model(model_domain, "SNB_Adult", data_table_path, output_data_table_path).. autofunction:: deploy_classifier_for_metrics
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "deploy_classifier_for_metrics")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "ScoresAdult.txt")
# Train the classifier for the target "class"
_, modeling_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    max_trees=0,
)
# Obtain the scores of the SNB on the test dataset to calculate the PR curve
# ("Exclude sample" deploys on the records excluded from the train sample)
kh.deploy_predictor_for_metrics(
    modeling_dictionary_file_path,
    "SNB_Adult",
    data_table_path,
    output_data_table_path,
    sampling_mode="Exclude sample",
    output_header_line=False,
)
# We estimate the precision/recall for the class "more" and increasing thresholds
# Note: Normally one would do this with a package (eg. sklearn.metrics)
thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]
true_positives = {thres: 0 for thres in thresholds}
false_positives = {thres: 0 for thres in thresholds}
false_negatives = {thres: 0 for thres in thresholds}
with open(output_data_table_path) as output_data_table:
    for line in output_data_table:
        fields = line.split("\t")
        # Column 0: true target; column 3: assumed to hold P(class = "more")
        # NOTE(review): confirm the column layout against the deployed dictionary
        true_target = fields[0]
        proba_more = float(fields[3])
        for thres in thresholds:
            if true_target == "more" and proba_more >= thres:
                true_positives[thres] += 1
            elif true_target == "more" and proba_more < thres:
                false_negatives[thres] += 1
            elif true_target == "less" and proba_more >= thres:
                false_positives[thres] += 1
# NOTE(review): precision is undefined (ZeroDivisionError) if no instance
# reaches a threshold; acceptable for this sample's data
precision = {
    thres: true_positives[thres] / (true_positives[thres] + false_positives[thres])
    for thres in thresholds
}
recall = {
    thres: true_positives[thres] / (true_positives[thres] + false_negatives[thres])
    for thres in thresholds
}
# Print the curve at the selected points
print("Precision and Recall for class 'more'")
print("threshold\trecall\tprecision")
thresholds.reverse()
for thres in thresholds:
    print(str(thres) + "\t" + str(recall[thres]) + "\t" + str(precision[thres])).. autofunction:: deploy_regressor_for_metrics
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "deploy_regressor_for_metrics")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "TrueAndPredictedAges.txt")
# Train the regressor for the target "age" (with 20% train to be quick)
_, modeling_dictionary_file_path = kh.train_predictor(
dictionary_file_path,
"Adult",
data_table_path,
"age",
report_file_path,
sample_percentage=20,
max_trees=0,
)
# Obtain the predicted regression values of the SNB on the test dataset estimate R2
kh.deploy_predictor_for_metrics(
modeling_dictionary_file_path,
"SNB_Adult",
data_table_path,
output_data_table_path,
sample_percentage=20,
sampling_mode="Exclude sample",
output_header_line=False,
)
# Estimate R2
# Note: Normally one would do this with a package (eg. sklearn.metrics)
# First pass to estimate sums of residuals and the mean
ss_res = 0
mean = 0
n_instances = 0
with open(output_data_table_path) as output_data_table:
for line in output_data_table:
fields = line.split("\t")
true_target = float(fields[0])
predicted_target = float(fields[1])
ss_res += (true_target - predicted_target) ** 2
mean += true_target
n_instances += 1
mean /= n_instances
# Second pass to estimate the total sums of squares and finish the R2 estimation
ss_tot = 0
with open(output_data_table_path) as output_data_table:
for line in output_data_table:
fields = line.split("\t")
true_target = float(fields[0])
ss_tot += (true_target - mean) ** 2
r2_score = 1 - ss_res / ss_tot
# Print results
print("Adult 'age' regression (30% train)")
print(f"R2 (explained variance) = {r2_score}").. autofunction:: sort_data_table
# Imports
import os
from khiops import core as kh
# Set the file paths
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
output_data_table_path = os.path.join(
    "kh_samples",
    "sort_data_table",
    "SortedAccidents.txt",
)
# Sort table (no sort_variables given — presumably the dictionary key is used;
# see the sort_data_table documentation)
kh.sort_data_table(
    dictionary_file_path, "Accident", accidents_table_path, output_data_table_path
).. autofunction:: sort_data_table_expert
# Imports
import os
from khiops import core as kh
# Set the file paths
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
output_data_table_path = os.path.join(
    "kh_samples", "sort_data_table_expert", "SortedVehicles.txt"
)
# Sort the table. Besides the mandatory parameters, it is specified:
# - A list containing the explicit sorting fields
kh.sort_data_table(
    dictionary_file_path,
    "Vehicle",
    vehicles_table_path,
    output_data_table_path,
    sort_variables=["AccidentId", "VehicleId"],
).. autofunction:: extract_keys_from_data_table
# Imports
import os
from khiops import core as kh
# Set the file paths
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
output_data_table_path = os.path.join(
    "kh_samples",
    "extract_keys_from_data_table",
    "KeysSpliceJunction.txt",
)
# Extract the keys from the "SpliceJunctionDNA" table into the output table
kh.extract_keys_from_data_table(
    dictionary_file_path,
    "SpliceJunctionDNA",
    data_table_path,
    output_data_table_path,
).. autofunction:: train_coclustering
# Imports
import os
from khiops import core as kh
# Set the file paths
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
coclustering_report_path = os.path.join(
    "kh_samples", "train_coclustering", "CoclusteringResults.khcj"
)
# Train a coclustering model for variables "SampleId" and "Char"
# The coclustering report is written to the .khcj path specified above
kh.train_coclustering(
    dictionary_file_path,
    "SpliceJunctionDNA",
    data_table_path,
    ["SampleId", "Char"],
    coclustering_report_path,
)
print(f"Coclustering report file available at {coclustering_report_path}")
# If you have Khiops Co-Visualization installed you may open the report as follows
# kh.visualize_report(coclustering_report_path).. autofunction:: train_instance_variable_coclustering
# Imports
import os
from khiops import core as kh
# Set the file paths
iris_dir = os.path.join(kh.get_samples_dir(), "Iris")
dictionary_file_path = os.path.join(iris_dir, "Iris.kdic")
data_table_path = os.path.join(iris_dir, "Iris.txt")
coclustering_report_path = os.path.join(
    "kh_samples",
    "train_instance_variable_coclustering",
    "CoclusteringResults.khcj",
)
# Train an instance-variable coclustering model on the "Iris" table
# (the previous comment was wrongly copied from the train_coclustering sample)
kh.train_instance_variable_coclustering(
    dictionary_file_path,
    "Iris",
    data_table_path,
    coclustering_report_path,
)
print(
    "Instance-variable coclustering report file available "
    f"at {coclustering_report_path}"
)
# If you have Khiops Co-Visualization installed you may open the report as follows
# kh.visualize_report(coclustering_report_path).. autofunction:: simplify_coclustering
# Imports
import os
from khiops import core as kh
# Set the file paths
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
output_dir = os.path.join("kh_samples", "simplify_coclustering")
coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj")
simplified_coclustering_file_path = os.path.join(
    output_dir, "simplified_coclustering.khcj"
)
# Train a coclustering model for variables "SampleId" and "Char"
kh.train_coclustering(
    dictionary_file_path,
    "SpliceJunctionDNA",
    data_table_path,
    ["SampleId", "Char"],
    coclustering_file_path,
)
# Simplify the trained coclustering with the constraints
# - maximum information preserved: 80%
# - maximum total parts number: 4
kh.simplify_coclustering(
    coclustering_file_path,
    simplified_coclustering_file_path,
    max_preserved_information=80,
    max_total_parts=4,
).. autofunction:: extract_clusters
# Set the file paths
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
output_dir = os.path.join("kh_samples", "extract_clusters")
coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj")
clusters_file_path = os.path.join(output_dir, "extracted_clusters.txt")
# Train a coclustering model for variables "SampleId" and "Char"
kh.train_coclustering(
dictionary_file_path,
"SpliceJunctionDNA",
data_table_path,
["SampleId", "Char"],
coclustering_file_path,
)
# Extract clusters
kh.extract_clusters(coclustering_file_path, "Char", clusters_file_path).. autofunction:: deploy_coclustering
# Imports
import os
from khiops import core as kh
# Set the initial file paths
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
output_dir = os.path.join("kh_samples", "deploy_coclustering")
coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj")
coclustering_dictionary_file_path = os.path.join(output_dir, "Coclustering.kdic")
output_data_table_path = os.path.join(output_dir, "DeployedSpliceJunctionDNA.txt")
# Train a coclustering model for variables "SampleId" and "Char"
kh.train_coclustering(
    dictionary_file_path,
    "SpliceJunctionDNA",
    data_table_path,
    ["SampleId", "Char"],
    coclustering_file_path,
)
# Deploy the "Char" clusters in the training database, keyed by "SampleId"
# Both the deployment dictionary and the deployed data table are written
kh.deploy_coclustering(
    dictionary_file_path,
    "SpliceJunctionDNA",
    data_table_path,
    coclustering_file_path,
    ["SampleId"],
    "Char",
    coclustering_dictionary_file_path,
    output_data_table_path,
    header_line=True,
).. autofunction:: deploy_coclustering_expert
# Imports
import os
from khiops import core as kh
# Set the initial file paths
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
data_table_path = os.path.join(splice_dir, "SpliceJunction.txt")
secondary_data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
output_dir = os.path.join("kh_samples", "deploy_coclustering_expert")
coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj")
# Train a coclustering model for variables "SampleId" and "Char"
print("train coclustering on SpliceJunctionDNA")
kh.train_coclustering(
    dictionary_file_path,
    "SpliceJunctionDNA",
    secondary_data_table_path,
    ["SampleId", "Char"],
    coclustering_file_path,
)
print("prepare_coclustering_deployment")
# The input dictionary is extended with new coclustering-based variables
augmented_dictionary_file_path = os.path.join(output_dir, "Coclustering.kdic")
kh.prepare_coclustering_deployment(
    dictionary_file_path,
    "SpliceJunction",
    coclustering_file_path,
    "DNA",
    "SampleId",
    augmented_dictionary_file_path,
)
print("prepare_coclustering_deployment with at most two clusters")
# Extend the already extended dictionary with the new variables from a
# simplified coclustering (at most 2 "SampleId" parts, variables prefixed "C2_")
reaugmented_dictionary_file_path = os.path.join(
    output_dir, "ReaugmentedCoclustering.kdic"
)
kh.prepare_coclustering_deployment(
    augmented_dictionary_file_path,
    "SpliceJunction",
    coclustering_file_path,
    "DNA",
    "SampleId",
    reaugmented_dictionary_file_path,
    variables_prefix="C2_",
    max_part_numbers={"SampleId": 2},
)
output_data_table_path = os.path.join(output_dir, "TransferredSpliceJunction.txt")
# Deploy the coclustering with the extended dictionary
print("deploy_model with the new coclustering based variables")
kh.deploy_model(
    reaugmented_dictionary_file_path,
    "SpliceJunction",
    data_table_path,
    output_data_table_path,
    additional_data_tables={"DNA": secondary_data_table_path},
)
deployed_dictionary_file_path = os.path.join(
    output_dir, "Transferred_Coclustering.kdic"
)
print("build_deployed_dictionary to get the new dictionary")
kh.build_deployed_dictionary(
    reaugmented_dictionary_file_path,
    "SpliceJunction",
    deployed_dictionary_file_path,
).. autofunction:: scenario_prologue
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
report_file_path = os.path.join(
    "kh_samples", "scenario_prologue", "AnalysisResults.khj"
)
# Set the maximum memory "by hand" with a scenario prologue
scenario_prologue = """
    // Max memory 2000 mb
    AnalysisSpec.SystemParameters.MemoryLimit 2000
    """
# Train the predictor
kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    max_trees=0,
    scenario_prologue=scenario_prologue,
).. autofunction:: build_deployed_dictionary
# Imports
import os
from khiops import core as kh
# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt")
output_dir = os.path.join("kh_samples", "build_deployed_dictionary")
deployed_dictionary_file_path = os.path.join(output_dir, "SNB_Iris_deployed.kdic")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
# Train the predictor
_, modeling_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Iris",
    data_table_path,
    "Class",
    report_file_path,
    max_trees=0,
)
# Build the dictionary to read the output of the predictor dictionary file
# It will contain the columns of the table generated by deploying the model
kh.build_deployed_dictionary(
    modeling_dictionary_file_path,
    "SNB_Iris",
    deployed_dictionary_file_path,
)
# Print the contents of the deployed dictionary file
with open(deployed_dictionary_file_path) as deployed_dictionary_file:
    for line in deployed_dictionary_file:
        print(line, end="")