khiops-python/doc/samples/samples.rst at a4ef9e5eac6d540e82c004493e49d579f30fefd3 · KhiopsML/khiops-python

:orphan:

.. currentmodule:: samples

Samples core

The code snippets on this page demonstrate the basic use of the :py:mod:`khiops.core` module.

Script and Jupyter notebook

The samples on this page are also available as:

:download:`Python script <../../khiops/samples/samples.py>`
:download:`Jupyter notebook <../../khiops/samples/samples.ipynb>`

Setup

First, make sure you have installed the sample datasets. In a configured conda shell (e.g. Anaconda Prompt on Windows) execute:

kh-download-datasets

If that doesn't work, open a Python console and execute:

# Download the sample datasets from a Python console
from khiops import tools as kh_tools

kh_tools.download_datasets()

Samples

.. autofunction:: get_khiops_version

# Imports
from khiops import core as kh

# Show the Khiops version reported by the library
print(f"Khiops version: {kh.get_khiops_version()}")

.. autofunction:: build_dictionary_from_data_table

# Imports
import os
from khiops import core as kh

# Input: the "Adult" sample data table
adult_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")

# Output: the name and the location of the dictionary to build
new_dictionary_name = "AutoAdult"
new_dictionary_file_path = os.path.join(
    "kh_samples", "build_dictionary_from_data_table", "AutoAdult.kdic"
)

# Build the dictionary from the contents of the data table
kh.build_dictionary_from_data_table(
    adult_table_path, new_dictionary_name, new_dictionary_file_path
)

.. autofunction:: create_dictionary_domain

# Imports
import os
from khiops import core as kh

# Create a Root dictionary
root_dictionary = kh.Dictionary(
    json_data={"name": "dict_from_scratch", "root": True, "key": ["Id"]}
)

# Start with simple variables to declare
simple_variables = [
    {"name": "Id", "type": "Categorical"},
    {"name": "Num", "type": "Numerical"},
    {"name": "text", "type": "Text"},
    {"name": "hour", "type": "Time"},
    {"name": "date", "type": "Date"},
    {"name": "ambiguous_ts", "type": "Timestamp"},
    {"name": "ts", "type": "TimestampTZ"},
]
for var_spec in simple_variables:
    var = kh.Variable()
    var.name = var_spec["name"]
    var.type = var_spec["type"]
    root_dictionary.add_variable(var)

# Create a second dictionary
second_dictionary = kh.Dictionary(
    json_data={"name": "Service", "key": ["Id", "id_product"]}
)
second_dictionary.add_variable(
    kh.Variable(json_data={"name": "Id", "type": "Categorical"})
)
second_dictionary.add_variable(
    kh.Variable(json_data={"name": "id_product", "type": "Categorical"})
)

# Create a third dictionary
third_dictionary = kh.Dictionary(json_data={"name": "Address", "key": ["Id"]})
third_dictionary.add_variable(
    kh.Variable(json_data={"name": "StreetNumber", "type": "Numerical"})
)
third_dictionary.add_variable(
    kh.Variable(json_data={"name": "StreetName", "type": "Categorical"})
)
third_dictionary.add_variable(
    kh.Variable(json_data={"name": "id_city", "type": "Categorical"})
)

# Add the variables used in a multi-table context in the first dictionary.
# They link the root dictionary to the additional ones
root_dictionary.add_variable(
    kh.Variable(json_data={"name": "Services", "type": "Table(Service)"})
)
root_dictionary.add_variable(
    kh.Variable(json_data={"name": "Address", "type": "Entity(Address)"})
)

# Create a DictionaryDomain (set of dictionaries)
dictionary_domain = kh.DictionaryDomain()
dictionary_domain.add_dictionary(root_dictionary)
dictionary_domain.add_dictionary(second_dictionary)
dictionary_domain.add_dictionary(third_dictionary)

output_dir = os.path.join("kh_samples", "create_dictionary_domain")
dictionary_file_path = os.path.join(output_dir, "dict_from_scratch.kdic")

# Create the output directory if needed
# Note: makedirs also creates the parent "kh_samples" directory when it is
# missing, whereas a plain mkdir would fail in that case
os.makedirs(output_dir, exist_ok=True)

# Write the dictionary domain to a file
dictionary_domain.export_khiops_dictionary_file(dictionary_file_path)

.. autofunction:: detect_data_table_format

# Imports
import os
from khiops import core as kh

# Set the file paths
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
output_dir = os.path.join("kh_samples", "detect_data_table_format")
transformed_data_table_path = os.path.join(output_dir, "AdultWithAnotherFormat.txt")

# Create the output directory
# Note: makedirs also creates the parent "kh_samples" directory when it is
# missing, whereas a plain mkdir would fail in that case
os.makedirs(output_dir, exist_ok=True)

# Detect the format of the table
format_spec = kh.detect_data_table_format(data_table_path)
print("Format specification (header_line, field_separator)")
print("Format detected on original table:", format_spec)

# Make a deployment to change the format of the data table:
# the output table has no header line and uses "," as field separator
kh.deploy_model(
    dictionary_file_path,
    "Adult",
    data_table_path,
    transformed_data_table_path,
    output_header_line=False,
    output_field_separator=",",
)

# Detect the new format of the table without a dictionary file
format_spec = kh.detect_data_table_format(transformed_data_table_path)
print("Format detected on reformatted table:", format_spec)

# Detect the new format of the table with a dictionary file
format_spec = kh.detect_data_table_format(
    transformed_data_table_path,
    dictionary_file_path_or_domain=dictionary_file_path,
    dictionary_name="Adult",
)
print("Format detected (with dictionary file) on reformatted table:", format_spec)

.. autofunction:: check_database

# Imports
import os
from khiops import core as kh

# Locate the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Location of the log file given to the check
check_log_path = os.path.join("kh_samples", "check_database", "check_database.log")

# Run the database check, passing max_messages=50
kh.check_database(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    log_file_path=check_log_path,
    max_messages=50,
)

.. autofunction:: export_dictionary_files

# Imports
import os
from khiops import core as kh

# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
output_dir = os.path.join("kh_samples", "export_dictionary_files")
output_dictionary_file_path = os.path.join(output_dir, "ModifiedAdult.kdic")
output_dictionary_json_path = os.path.join(output_dir, "ModifiedAdult.kdicj")
alt_output_dictionary_json_path = os.path.join(output_dir, "AltModifiedAdult.kdicj")

# Load the dictionary domain from initial dictionary file
# Then obtain the "Adult" dictionary within
domain = kh.read_dictionary_file(dictionary_file_path)
dictionary = domain.get_dictionary("Adult")

# Set some of its variables to unused
fnlwgt_variable = dictionary.get_variable("fnlwgt")
fnlwgt_variable.used = False
label_variable = dictionary.get_variable("Label")
label_variable.used = False

# Create output directory if necessary
# Note: makedirs creates "kh_samples" and the sample sub-directory in one call
# and does nothing when they already exist
os.makedirs(output_dir, exist_ok=True)

# Export to kdic
domain.export_khiops_dictionary_file(output_dictionary_file_path)

# Export to kdicj either from the domain or from a kdic file
# Requires a Khiops execution, that's why it is not a method of DictionaryDomain
kh.export_dictionary_as_json(domain, output_dictionary_json_path)
kh.export_dictionary_as_json(
    output_dictionary_file_path, alt_output_dictionary_json_path
)

.. autofunction:: train_predictor

# Imports
import os
from khiops import core as kh

# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Output: the analysis report of the training
report_path = os.path.join("kh_samples", "train_predictor", "AnalysisReport.khj")

# Train a predictor for the "class" variable, without trees (max_trees=0)
kh.train_predictor(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    report_path,
    max_trees=0,
)

.. autofunction:: train_predictor_file_paths

# Imports
import os
from khiops import core as kh

# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Output: the analysis report of the training
results_path = os.path.join(
    "kh_samples", "train_predictor_file_paths", "AnalysisResults.khj"
)

# Train the predictor and keep the second returned value:
# the path of the modeling dictionary file
training_outputs = kh.train_predictor(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    results_path,
    max_trees=0,
)
_, modeling_kdic_path = training_outputs

# Tell the user where the resulting files are
print("Reports file available at " + results_path)
print("Modeling dictionary file available at " + modeling_kdic_path)

# If you have Khiops Visualization installed you may open the report as follows
# kh.visualize_report(results_path)

.. autofunction:: train_predictor_text

# Imports
import os
from khiops import core as kh

# Inputs: the "NegativeAirlineTweets" sample dictionary and data table
tweets_dir = os.path.join(kh.get_samples_dir(), "NegativeAirlineTweets")
tweets_kdic_path = os.path.join(tweets_dir, "NegativeAirlineTweets.kdic")
tweets_table_path = os.path.join(tweets_dir, "NegativeAirlineTweets.txt")

# Output: the analysis report of the training
results_path = os.path.join(
    "kh_samples", "train_predictor_text", "AnalysisResults.khj"
)

# Train the predictor for the "negativereason" variable with the
# text-related parameters max_text_features and text_features
kh.train_predictor(
    tweets_kdic_path,
    "FlightNegativeTweets",
    tweets_table_path,
    "negativereason",
    results_path,
    max_trees=5,
    max_text_features=1000,
    text_features="words",
)

.. autofunction:: train_predictor_error_handling

# Imports
import os
from khiops import core as kh

# The dictionary file deliberately does not exist
missing_kdic_path = "NONEXISTENT_DICTIONARY_FILE.kdic"
adult_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")

# Output files: report, Khiops log and executed scenario
results_dir = os.path.join("kh_samples", "train_predictor_error_handling")
results_path = os.path.join(results_dir, "AnalysisResults.khj")
khiops_log_path = os.path.join(results_dir, "khiops.log")
executed_scenario_path = os.path.join(results_dir, "scenario._kh")

# Attempt the training and report the error raised by Khiops
try:
    kh.train_predictor(
        missing_kdic_path,
        "Adult",
        adult_table_path,
        "class",
        results_path,
        trace=True,
        log_file_path=khiops_log_path,
        output_scenario_path=executed_scenario_path,
    )
except kh.KhiopsRuntimeError as training_error:
    print("Khiops training failed! Below the KhiopsRuntimeError message:")
    print(training_error)

# Dump the log and then the scenario, each one under its own title
for title, underline, text_file_path in (
    ("\nFull log contents:", "------------------", khiops_log_path),
    ("\nExecuted scenario", "-----------------", executed_scenario_path),
):
    print(title)
    print(underline)
    with open(text_file_path) as text_file:
        print(text_file.read(), end="")

.. autofunction:: train_predictor_mt

# Imports
import os
from khiops import core as kh

# Inputs: the "AccidentsSummary" sample dictionary and its two tables
samples_subdir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
accidents_kdic_path = os.path.join(samples_subdir, "Accidents.kdic")
main_table_path = os.path.join(samples_subdir, "Accidents.txt")
secondary_table_path = os.path.join(samples_subdir, "Vehicles.txt")

# Output: the analysis report of the training
results_path = os.path.join(
    "kh_samples", "train_predictor_mt", "AnalysisResults.khj"
)

# Python dictionary linking data paths to file paths for non-root tables
extra_tables = {"Vehicles": secondary_table_path}

# Train the predictor. On top of the mandatory parameters we give:
# - The additional data tables
# - max_trees=0 to not construct any decision tree
# The default number of automatic features is 100
kh.train_predictor(
    accidents_kdic_path,
    "Accident",
    main_table_path,
    "Gravity",
    results_path,
    additional_data_tables=extra_tables,
    max_trees=0,
)

.. autofunction:: train_predictor_mt_with_specific_rules

# Imports
import os
from khiops import core as kh

# Inputs: the "AccidentsSummary" sample dictionary and its two tables
samples_subdir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
accidents_kdic_path = os.path.join(samples_subdir, "Accidents.kdic")
main_table_path = os.path.join(samples_subdir, "Accidents.txt")
secondary_table_path = os.path.join(samples_subdir, "Vehicles.txt")

# Output: the analysis report of the training
results_path = os.path.join(
    "kh_samples",
    "train_predictor_mt_with_specific_rules",
    "AnalysisResults.khj",
)

# Python dictionary linking data paths to file paths for non-root tables
extra_tables = {"Vehicles": secondary_table_path}

# Construction rules allowed to automatically create aggregates
allowed_rules = ["TableMode", "TableSelection"]

# Train the predictor. On top of the mandatory parameters we give:
# - The additional data tables
# - The maximum number of aggregate variables to construct (1000)
# - The allowed construction rules
# - max_trees=0 to not construct any decision tree
kh.train_predictor(
    accidents_kdic_path,
    "Accident",
    main_table_path,
    "Gravity",
    results_path,
    additional_data_tables=extra_tables,
    max_constructed_variables=1000,
    construction_rules=allowed_rules,
    max_trees=0,
)

.. autofunction:: train_predictor_mt_snowflake

# Imports
import os
from khiops import core as kh

# Inputs: the "Accidents" sample dictionary and its four tables
samples_subdir = os.path.join(kh.get_samples_dir(), "Accidents")
accidents_kdic_path = os.path.join(samples_subdir, "Accidents.kdic")
main_table_path = os.path.join(samples_subdir, "Accidents.txt")

# Python dictionary linking data paths to file paths for non-root tables
extra_tables = {
    "Vehicles": os.path.join(samples_subdir, "Vehicles.txt"),
    "Vehicles/Users": os.path.join(samples_subdir, "Users.txt"),
    "Place": os.path.join(samples_subdir, "Places.txt"),
}

# Output: the analysis report of the training
results_path = os.path.join(
    "kh_samples", "train_predictor_mt_snowflake", "AnalysisResults.khj"
)

# Train the predictor. On top of the mandatory parameters we give:
# - The additional data tables
# - max_trees=0 to not construct any decision tree
# The default number of automatic features is 100
kh.train_predictor(
    accidents_kdic_path,
    "Accident",
    main_table_path,
    "Gravity",
    results_path,
    additional_data_tables=extra_tables,
    max_trees=0,
)

.. autofunction:: train_predictor_with_train_percentage

# Imports
import os
from khiops import core as kh

# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Output: the analysis report of the training
results_path = os.path.join(
    "kh_samples",
    "train_predictor_with_train_percentage",
    "P90_AnalysisResults.khj",
)

# Train the predictor. On top of the mandatory parameters we give:
# - A 90% sampling rate for the training dataset
# - The complement of the training dataset (10%) as test dataset
# - No trees
kh.train_predictor(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    results_path,
    sample_percentage=90,
    use_complement_as_test=True,
    max_trees=0,
)

.. autofunction:: train_predictor_with_trees

# Imports
import os
from khiops import core as kh

# Inputs: the "Letter" sample dictionary and data table
letter_dir = os.path.join(kh.get_samples_dir(), "Letter")
letter_kdic_path = os.path.join(letter_dir, "Letter.kdic")
letter_table_path = os.path.join(letter_dir, "Letter.txt")

# Output: the analysis report of the training
results_path = os.path.join(
    "kh_samples", "train_predictor_with_trees", "P80_AnalysisResults.khj"
)

# Train the predictor allowing up to 15 trees (default 10), with an 80%
# sampling rate and the complement of the sample as test dataset
kh.train_predictor(
    letter_kdic_path,
    "Letter",
    letter_table_path,
    "lettr",
    results_path,
    sample_percentage=80,
    use_complement_as_test=True,
    max_trees=15,
)

.. autofunction:: train_predictor_with_pairs

# Imports
import os
from khiops import core as kh

# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Output: the analysis report of the training
results_path = os.path.join(
    "kh_samples", "train_predictor_with_pairs", "AnalysisResults.khj"
)

# Pairs of variables to include:
# - age-race and capital_gain-capital_loss
# - All possible pairs having relationship as component
pairs_to_include = [
    ("age", "race"),
    ("capital_gain", "capital_loss"),
    ("relationship", ""),
]

# Train the predictor with at most 10 pairs
kh.train_predictor(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    results_path,
    use_complement_as_test=True,
    max_trees=0,
    max_pairs=10,
    specific_pairs=pairs_to_include,
)

.. autofunction:: train_predictor_with_multiple_parameters

# Imports
import os
from khiops import core as kh

# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Outputs: analysis report, Khiops script and log file
results_dir = os.path.join("kh_samples", "train_predictor_with_multiple_parameters")
results_path = os.path.join(results_dir, "AnalysisResults.khj")
khiops_script_path = os.path.join(results_dir, "output_scenario._kh")
khiops_log_path = os.path.join(results_dir, "log.txt")

# Train the predictor. On top of the mandatory parameters we give:
# - "more" as the main target value
# - Where to write the output Khiops script file (generic)
# - Where to write the log file (generic)
# - A maximum memory of 1000 MB
# - The request to show the debug trace (generic)
kh.train_predictor(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    results_path,
    main_target_value="more",
    output_scenario_path=khiops_script_path,
    log_file_path=khiops_log_path,
    memory_limit_mb=1000,
    trace=True,
)

.. autofunction:: train_predictor_detect_format

# Imports
import os
from khiops import core as kh

# Inputs: the "Iris" sample dictionary and data table
iris_dir = os.path.join(kh.get_samples_dir(), "Iris")
iris_kdic_path = os.path.join(iris_dir, "Iris.kdic")
iris_table_path = os.path.join(iris_dir, "Iris.txt")

# Outputs: the reformatted data table and the analysis report
results_dir = os.path.join("kh_samples", "train_predictor_detect_format")
reformatted_table_path = os.path.join(results_dir, "TransformedIris.txt")
results_path = os.path.join(results_dir, "AnalysisResults.khj")

# Rewrite the database, going from header_line=True and field_separator=TAB
# to header_line=False and field_separator=","
# See the deploy_model examples below for more details
kh.deploy_model(
    iris_kdic_path,
    "Iris",
    iris_table_path,
    reformatted_table_path,
    output_header_line=False,
    output_field_separator=",",
)

# First attempt: learn on the reformatted table while declaring the old format
try:
    kh.train_predictor(
        iris_kdic_path,
        "Iris",
        reformatted_table_path,
        "Class",
        results_path,
        header_line=True,
        field_separator="",
    )
except kh.KhiopsRuntimeError as format_error:
    failure_message = (
        "This failed because of a bad data table format spec. "
        "Below the KhiopsRuntimeError message"
    )
    print(failure_message)
    print(format_error)

# Second attempt: train without specifying the format
# (detect_format is True by default)
kh.train_predictor(
    iris_kdic_path,
    "Iris",
    reformatted_table_path,
    "Class",
    results_path,
)

.. autofunction:: train_predictor_with_cross_validation

# Imports
import math
import os
from khiops import core as kh

# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "train_predictor_with_cross_validation")
fold_dictionary_file_path = os.path.join(output_dir, "AdultWithFolding.kdic")

# Create the output directory
# Note: makedirs also creates the parent "kh_samples" directory when it is
# missing, whereas a plain mkdir would fail in that case
os.makedirs(output_dir, exist_ok=True)

# Load the learning dictionary object
domain = kh.read_dictionary_file(dictionary_file_path)
dictionary = domain.get_dictionary("Adult")

# Add a random fold index variable to the learning dictionary
fold_number = 5
fold_index_variable = kh.Variable()
fold_index_variable.name = "FoldIndex"
fold_index_variable.type = "Numerical"
fold_index_variable.used = False
dictionary.add_variable(fold_index_variable)

# Create fold indexing rule and set it on `fold_index_variable`
dictionary.get_variable(fold_index_variable.name).set_rule(
    kh.Rule("Ceil", kh.Rule("Product", fold_number, kh.Rule("Random()"))),
)

# Add variables that indicate if the instance is in the train dataset:
# the k-th one is set with a "NEQ" rule between the fold index variable and k
for fold_index in range(1, fold_number + 1):
    is_in_train_dataset_variable = kh.Variable()
    is_in_train_dataset_variable.name = f"IsInTrainDataset{fold_index}"
    is_in_train_dataset_variable.type = "Numerical"
    is_in_train_dataset_variable.used = False
    dictionary.add_variable(is_in_train_dataset_variable)
    dictionary.get_variable(is_in_train_dataset_variable.name).set_rule(
        kh.Rule("NEQ", fold_index_variable, fold_index),
    )

# Print dictionary with fold variables
print("Dictionary file with fold variables")
domain.export_khiops_dictionary_file(fold_dictionary_file_path)
with open(fold_dictionary_file_path) as fold_dictionary_file:
    for line in fold_dictionary_file:
        print(line, end="")

# For each fold k:
print(f"Training Adult with {fold_number} folds")
print("\tfold\ttrain auc\ttest auc")
train_aucs = []
test_aucs = []
for fold_index in range(1, fold_number + 1):
    analysis_report_file_path = os.path.join(
        output_dir, f"Fold{fold_index}AnalysisResults.khj"
    )
    # Train a model from the sub-dataset where IsInTrainDataset<k> is 1
    _, modeling_dictionary_file_path = kh.train_predictor(
        domain,
        "Adult",
        data_table_path,
        "class",
        analysis_report_file_path,
        sample_percentage=100,
        selection_variable=f"IsInTrainDataset{fold_index}",
        selection_value=1,
        max_trees=0,
    )

    evaluation_report_file_path = os.path.join(
        output_dir, f"Fold{fold_index}AdultEvaluationResults.khj"
    )
    # Evaluate the resulting model in the subsets where IsInTrainDataset is 0
    test_evaluation_report_path = kh.evaluate_predictor(
        modeling_dictionary_file_path,
        "SNB_Adult",
        data_table_path,
        evaluation_report_file_path,
        sample_percentage=100,
        selection_variable=f"IsInTrainDataset{fold_index}",
        selection_value=0,
    )

    # Obtain the train AUC from the train report and the test AUC from the
    # evaluation report and print them
    train_results = kh.read_analysis_results_file(analysis_report_file_path)
    test_evaluation_results = kh.read_analysis_results_file(test_evaluation_report_path)
    train_auc = train_results.train_evaluation_report.get_snb_performance().auc
    test_auc = test_evaluation_results.evaluation_report.get_snb_performance().auc
    print(f"\t{fold_index}\t{train_auc}\t{test_auc}")

    # Store the train and test AUCs in arrays
    train_aucs.append(train_auc)
    test_aucs.append(test_auc)

# Print the mean +- error aucs for both train and test
# The error is the standard deviation with the (fold_number - 1) divisor
mean_train_auc = sum(train_aucs) / fold_number
squared_error_train_aucs = [(auc - mean_train_auc) ** 2 for auc in train_aucs]
sd_train_auc = math.sqrt(sum(squared_error_train_aucs) / (fold_number - 1))

mean_test_auc = sum(test_aucs) / fold_number
squared_error_test_aucs = [(auc - mean_test_auc) ** 2 for auc in test_aucs]
sd_test_auc = math.sqrt(sum(squared_error_test_aucs) / (fold_number - 1))

print("final auc")
print(f"train auc: {mean_train_auc} +- {sd_train_auc}")
print(f"test  auc: {mean_test_auc} +- {sd_test_auc}")

.. autofunction:: interpret_predictor

# Imports
import os
from khiops import core as kh

# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Outputs: the analysis report and the interpretation model dictionary
results_dir = os.path.join("kh_samples", "interpret_predictor")
results_path = os.path.join(results_dir, "AnalysisResults.khj")
interpretation_kdic_path = os.path.join(results_dir, "InterpretationModel.kdic")

# Step 1: train the prediction model and keep its dictionary file path
training_outputs = kh.train_predictor(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    results_path,
)
_, predictor_kdic_path = training_outputs

# Step 2: build the interpretation model from the "SNB_Adult" predictor
kh.interpret_predictor(predictor_kdic_path, "SNB_Adult", interpretation_kdic_path)

print(f"The interpretation model is '{interpretation_kdic_path}'")

.. autofunction:: reinforce_predictor

# Imports
import os
from khiops import core as kh

# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Outputs: the analysis report and the reinforced predictor dictionary
results_dir = os.path.join("kh_samples", "reinforce_predictor")
results_path = os.path.join(results_dir, "AnalysisResults.khj")
reinforced_kdic_path = os.path.join(results_dir, "ReinforcedAdultModel.kdic")

# Step 1: train the prediction model and keep its dictionary file path
training_outputs = kh.train_predictor(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    results_path,
)
_, predictor_kdic_path = training_outputs

# Step 2: build the reinforced predictor from the "SNB_Adult" predictor,
# with "occupation" as the only reinforcement lever variable
lever_variables = ["occupation"]
kh.reinforce_predictor(
    predictor_kdic_path,
    "SNB_Adult",
    reinforced_kdic_path,
    reinforcement_lever_variables=lever_variables,
)

print(f"The reinforced predictor is '{reinforced_kdic_path}'")

.. autofunction:: multiple_train_predictor

# Imports
import os
from khiops import core as kh


def display_test_results(json_result_file_path):
    """Print one tab-separated row summarizing a training report

    The row contains the number of entries in the preparation report's
    variables statistics, then the train AUC and the test AUC of the SNB.
    """
    report = kh.read_analysis_results_file(json_result_file_path)
    row_cells = (
        len(report.preparation_report.variables_statistics),
        report.train_evaluation_report.get_snb_performance().auc,
        report.test_evaluation_report.get_snb_performance().auc,
    )
    print("\t" + "\t".join(str(cell) for cell in row_cells))


# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Outputs: all the reports go to this directory
results_dir = os.path.join("kh_samples", "multiple_train_predictor")
full_model_report_path = os.path.join(results_dir, "AnalysisResults.khj")

# Load the dictionary file and get the "Adult" Dictionary instance
adult_domain = kh.read_dictionary_file(adult_kdic_path)
adult_dictionary = adult_domain.get_dictionary("Adult")

# First train a SNB model with every variable
print("\t#vars\ttrain auc\ttest auc")
kh.train_predictor(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    full_model_report_path,
    sample_percentage=70,
    use_complement_as_test=True,
    max_trees=0,
)
display_test_results(full_model_report_path)

# Read results to obtain the variables sorted by decreasing Level
full_model_results = kh.read_analysis_results_file(full_model_report_path)
variables_statistics = full_model_results.preparation_report.variables_statistics

# Train a sequence of models with a decreasing number of variables
# The variables are disabled one-by-one in increasing level (predictive power)
# order, that is, going through the statistics from last to first
for step, weakest_variable in enumerate(reversed(variables_statistics)):
    # Disable the variable in the in-memory dictionary
    adult_dictionary.get_variable(weakest_variable.name).used = False

    # Train the model with the modified dictionary domain object
    step_report_path = os.path.join(results_dir, f"V{step}_AnalysisResults.khj")
    kh.train_predictor(
        adult_domain,
        "Adult",
        adult_table_path,
        "class",
        step_report_path,
        sample_percentage=70,
        use_complement_as_test=True,
        max_trees=0,
    )

    # Show a preview of the results
    display_test_results(step_report_path)

.. autofunction:: evaluate_predictor

# Imports
import os
from khiops import core as kh

# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Outputs: the training report and the evaluation report
results_dir = os.path.join("kh_samples", "evaluate_predictor")
training_report_path = os.path.join(results_dir, "AnalysisResults.khj")
evaluation_report_path = os.path.join(results_dir, "AdultEvaluationResults.khj")

# Step 1: train the predictor and keep its dictionary file path
training_outputs = kh.train_predictor(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    training_report_path,
    max_trees=0,
)
_, predictor_kdic_path = training_outputs

# Step 2: evaluate the "SNB_Adult" predictor on the data table
kh.evaluate_predictor(
    predictor_kdic_path,
    "SNB_Adult",
    adult_table_path,
    evaluation_report_path,
)
print("Evaluation report available at " + evaluation_report_path)

.. autofunction:: access_predictor_evaluation_report

# Imports
import os
from khiops import core as kh

# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Output: the analysis report of the training
analysis_report_path = os.path.join(
    "kh_samples", "access_predictor_evaluation_report", "AdultAnalysisReport.khj"
)

# Train the SNB predictor and some univariate predictors
# Note: Evaluation in test is 30% by default
kh.train_predictor(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    analysis_report_path,
    max_trees=0,
)

# Load the report and get the SNB performance on the test dataset
analysis_results = kh.read_analysis_results_file(analysis_report_path)
test_report = analysis_results.test_evaluation_report
snb = test_report.get_snb_performance()

# Print every metric available for the SNB
print("\nperformance metrics for " + snb.name)
for snb_metric in snb.get_metric_names():
    print(snb_metric + ": " + str(snb.get_metric(snb_metric)))

# Print the confusion matrix: a header row with the target values,
# then one row per target value with its frequencies
print("\nconfusion matrix:")
matrix = snb.confusion_matrix
print("".join("\t" + value for value in matrix.values))
for row_index, row_value in enumerate(matrix.values):
    row_cells = "".join("\t" + str(count) for count in matrix.matrix[row_index])
    print(row_value + row_cells)

# Print the head of the lift curves for the 'more' modality
print("\nfirst five values of the lift curves for 'more'")
lift_curves = (
    test_report.get_snb_lift_curve("more"),
    test_report.get_classifier_lift_curve("Optimal", "more"),
    test_report.get_classifier_lift_curve("Random", "more"),
)
for point_index in range(5):
    print("\t".join(str(curve.values[point_index]) for curve in lift_curves))

# Print metrics for an SNB predictor, this time looked up by its name
named_performance = test_report.get_predictor_performance("Selective Naive Bayes")
print("\n\nperformance metrics for " + named_performance.name)
for named_metric in named_performance.get_metric_names():
    print(named_metric + ": " + str(named_performance.get_metric(named_metric)))

.. autofunction:: train_recoder

# Imports
import os
from khiops import core as kh

# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Output: the analysis report of the training
results_path = os.path.join("kh_samples", "train_recoder", "AnalysisResults.khj")

# Train the recoder model for the "class" variable
kh.train_recoder(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    results_path,
)

.. autofunction:: train_recoder_with_multiple_parameters

# Imports
import os
from khiops import core as kh

# Inputs: the "Adult" sample dictionary and data table
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
adult_kdic_path = os.path.join(adult_dir, "Adult.kdic")
adult_table_path = os.path.join(adult_dir, "Adult.txt")

# Output: the analysis report of the training
results_path = os.path.join(
    "kh_samples",
    "train_recoder_with_multiple_parameters",
    "AnalysisResults.khj",
)

# Train the recoder model with max_pairs=10 and the "part label"
# recoding method for both categorical and numerical variables
recoding_method = "part label"
kh.train_recoder(
    adult_kdic_path,
    "Adult",
    adult_table_path,
    "class",
    results_path,
    max_pairs=10,
    categorical_recoding_method=recoding_method,
    numerical_recoding_method=recoding_method,
)

.. autofunction:: train_recoder_mt_flatten

# Imports
import os
from khiops import core as kh

# Inputs: the "AccidentsSummary" sample dictionary and its two tables
samples_subdir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
accidents_kdic_path = os.path.join(samples_subdir, "Accidents.kdic")
main_table_path = os.path.join(samples_subdir, "Accidents.txt")
secondary_table_path = os.path.join(samples_subdir, "Vehicles.txt")

# Output: the analysis report of the training
results_path = os.path.join(
    "kh_samples", "train_recoder_mt_flatten", "AnalysisResults.khj"
)

# Python dictionary linking data paths to file paths for non-root tables
extra_tables = {"Vehicles": secondary_table_path}

# Train the recoder. On top of the mandatory parameters we give:
# - The additional data tables
# - The maximum number of aggregate variables to construct (1000)
# - To keep all the created variables independently of their informativeness (level)
# - To not recode the variables values
# - To keep the initial categorical and numerical variables
kh.train_recoder(
    accidents_kdic_path,
    "Accident",
    main_table_path,
    "Gravity",
    results_path,
    additional_data_tables=extra_tables,
    max_constructed_variables=1000,
    informative_variables_only=False,
    categorical_recoding_method="none",
    numerical_recoding_method="none",
    keep_initial_categorical_variables=True,
    keep_initial_numerical_variables=True,
)

.. autofunction:: deploy_model

# Imports
import os
from khiops import core as kh

# Input files: dictionary and data table of the "Adult" sample dataset
adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
dictionary_file_path = os.path.join(adult_dir, "Adult.kdic")
data_table_path = os.path.join(adult_dir, "Adult.txt")

# Output files: the training report and the scored data table
output_dir = os.path.join("kh_samples", "deploy_model")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "ScoresAdult.txt")

# Train the predictor for the target "class" without trees (max_trees=0)
# Only the second returned value, the model dictionary file path, is kept
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    max_trees=0,
)

# Deploy the "SNB_Adult" model dictionary on the same data table:
# the output table contains the scores given by the trained predictor
kh.deploy_model(
    model_dictionary_file_path,
    "SNB_Adult",
    data_table_path,
    output_data_table_path,
)

.. autofunction:: deploy_model_text

# Imports
import os
from khiops import core as kh

# Input files: dictionary and data table of the "NegativeAirlineTweets" dataset
tweets_dir = os.path.join(kh.get_samples_dir(), "NegativeAirlineTweets")
dictionary_file_path = os.path.join(tweets_dir, "NegativeAirlineTweets.kdic")
data_table_path = os.path.join(tweets_dir, "NegativeAirlineTweets.txt")

# Output files: the training report and the scored data table
output_dir = os.path.join("kh_samples", "deploy_model_text")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "ScoresNegativeAirlineTweets.txt")

# Options for the training step: number of trees and text feature settings
text_training_options = {
    "max_trees": 5,
    "max_text_features": 1000,
    "text_features": "words",
}

# Train the predictor for the target "negativereason"
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "FlightNegativeTweets",
    data_table_path,
    "negativereason",
    report_file_path,
    **text_training_options,
)

# Deploy the "SNB_FlightNegativeTweets" model dictionary on the same data table:
# the output table contains the scores given by the trained predictor
kh.deploy_model(
    model_dictionary_file_path,
    "SNB_FlightNegativeTweets",
    data_table_path,
    output_data_table_path,
)

.. autofunction:: deploy_model_mt

# Imports
import os
from khiops import core as kh

# Input files: dictionary, root table and secondary table of "AccidentsSummary"
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")

# Output files: the training report and the deployed data table
output_dir = os.path.join("kh_samples", "deploy_model_mt")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "TransferredAccidents.txt")

# Train the predictor (see train_predictor_mt for details)
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
    max_trees=0,
)

# Deploy the "SNB_Accident" model dictionary on the database
# Besides the mandatory parameters, it is specified:
# - A python dictionary linking data paths to file paths for non-root tables
kh.deploy_model(
    model_dictionary_file_path,
    "SNB_Accident",
    accidents_table_path,
    output_data_table_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
)

.. autofunction:: deploy_model_mt_with_interpretation

# Imports
import os
from khiops import core as kh

# Input files: dictionary, root table and secondary table of "AccidentsSummary"
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")

# Output files: training report, interpretation dictionary and deployed table
output_dir = os.path.join("kh_samples", "deploy_model_mt_with_interpretation")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
interpretor_file_path = os.path.join(output_dir, "InterpretationModel.kdic")
output_data_table_path = os.path.join(output_dir, "InterpretedAccidents.txt")

# Step 1: train the predictor (see train_predictor_mt for details)
# max_evaluated_variables is added so that an interpretation model can be built
# (see https://github.com/KhiopsML/khiops/issues/577)
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
    max_trees=0,
    max_evaluated_variables=10,
)

# Step 2: build the interpretation dictionary from the "SNB_Accident" predictor
kh.interpret_predictor(
    model_dictionary_file_path,
    "SNB_Accident",
    interpretor_file_path,
    max_variable_importances=3,
    importance_ranking="Individual",
)

# Step 3: deploy the interpretation model on the database
# Besides the mandatory parameters, it is specified:
# - A python dictionary linking data paths to file paths for non-root tables
kh.deploy_model(
    interpretor_file_path,
    "Interpretation_SNB_Accident",
    accidents_table_path,
    output_data_table_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
)

.. autofunction:: deploy_reinforced_model_mt

# Imports
import os
from khiops import core as kh

# Input files: dictionary, root table and secondary table of "AccidentsSummary"
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")

# Output files: training report, reinforced dictionary and deployed table
output_dir = os.path.join("kh_samples", "deploy_reinforced_model_mt")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
reinforced_predictor_file_path = os.path.join(output_dir, "ReinforcedModel.kdic")
output_data_table_path = os.path.join(output_dir, "ReinforcedAccidents.txt")

# Step 1: train the predictor (see train_predictor_mt for details)
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
    max_trees=0,
)

# Step 2: reinforce the "SNB_Accident" predictor for the target value "NonLethal"
# using "InAgglomeration" and "CollisionType" as lever variables
lever_variables = ["InAgglomeration", "CollisionType"]
kh.reinforce_predictor(
    model_dictionary_file_path,
    "SNB_Accident",
    reinforced_predictor_file_path,
    reinforcement_target_value="NonLethal",
    reinforcement_lever_variables=lever_variables,
)

# Step 3: deploy the reinforced model on the database
# Besides the mandatory parameters, it is specified:
# - A python dictionary linking data paths to file paths for non-root tables
kh.deploy_model(
    reinforced_predictor_file_path,
    "Reinforcement_SNB_Accident",
    accidents_table_path,
    output_data_table_path,
    additional_data_tables={"Vehicles": vehicles_table_path},
)

.. autofunction:: deploy_model_mt_snowflake

# Imports
import os
from khiops import core as kh

# Input files: dictionary and the four tables of the "Accidents" sample dataset
accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
users_table_path = os.path.join(accidents_dir, "Users.txt")
places_table_path = os.path.join(accidents_dir, "Places.txt")

# Output files: the training report and the deployed data table
output_dir = os.path.join("kh_samples", "deploy_model_mt_snowflake")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "TransferredAccidents.txt")

# A python dictionary linking data paths to file paths for non-root tables
# It is used both for training and for deployment
secondary_tables = {
    "Vehicles": vehicles_table_path,
    "Vehicles/Users": users_table_path,
    "Place": places_table_path,
}

# Train the predictor. Besides the mandatory parameters, we specify:
# - The non-root tables mapping defined above
# - To not construct any decision tree
# The default number of automatic features is 100
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    "Gravity",
    report_file_path,
    additional_data_tables=secondary_tables,
    max_trees=0,
)

# Deploy the model on the database
# Besides the mandatory parameters, it is specified:
# - The non-root tables mapping defined above
kh.deploy_model(
    model_dictionary_file_path,
    "SNB_Accident",
    accidents_table_path,
    output_data_table_path,
    additional_data_tables=secondary_tables,
)

.. autofunction:: deploy_model_expert

# Imports
import os
from khiops import core as kh

# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "deploy_model_expert")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "ScoresAdult.txt")

# Train the predictor
_, model_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    max_trees=0,
)

# Read the dictionary file to obtain an instance of class Dictionary
model_domain = kh.read_dictionary_file(model_dictionary_file_path)
snb_dictionary = model_domain.get_dictionary("SNB_Adult")

# Select Label (identifier)
snb_dictionary.get_variable("Label").used = True

# Select the variables containing the probabilities for each class
for variable in snb_dictionary.variables:
    # The variable must have a meta-data key that starts with "TargetProb"
    for key in variable.meta_data.keys:
        if key.startswith("TargetProb"):
            variable.used = True

# Deploy the model. Besides the mandatory parameters, it is specified:
# - A DictionaryDomain object to use instead of the mandatory dictionary file
kh.deploy_model(model_domain, "SNB_Adult", data_table_path, output_data_table_path)

.. autofunction:: deploy_classifier_for_metrics

# Imports
import os
from khiops import core as kh

# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "deploy_classifier_for_metrics")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "ScoresAdult.txt")

# Train the classifier for the target "class"
_, modeling_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    max_trees=0,
)

# Obtain the scores of the SNB on the test dataset to calculate the PR curve
# The output table is written without a header line so every line is an instance
kh.deploy_predictor_for_metrics(
    modeling_dictionary_file_path,
    "SNB_Adult",
    data_table_path,
    output_data_table_path,
    sampling_mode="Exclude sample",
    output_header_line=False,
)

# We estimate the precision/recall for the class "more" and increasing thresholds
# Note: Normally one would do this with a package (eg. sklearn.metrics)
thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]
true_positives = {thres: 0 for thres in thresholds}
false_positives = {thres: 0 for thres in thresholds}
false_negatives = {thres: 0 for thres in thresholds}
with open(output_data_table_path) as output_data_table:
    for line in output_data_table:
        # Fields read: 0 is the true class, 3 is the probability of "more"
        fields = line.split("\t")
        true_target = fields[0]
        proba_more = float(fields[3])
        for thres in thresholds:
            if true_target == "more" and proba_more >= thres:
                true_positives[thres] += 1
            elif true_target == "more" and proba_more < thres:
                false_negatives[thres] += 1
            elif true_target == "less" and proba_more >= thres:
                false_positives[thres] += 1

# Precision and recall are undefined when their denominator is zero (e.g. no
# instance is scored above the threshold): NaN is reported in that case instead
# of failing with a ZeroDivisionError
precision = {}
recall = {}
for thres in thresholds:
    predicted_positives = true_positives[thres] + false_positives[thres]
    actual_positives = true_positives[thres] + false_negatives[thres]
    precision[thres] = (
        true_positives[thres] / predicted_positives
        if predicted_positives > 0
        else float("nan")
    )
    recall[thres] = (
        true_positives[thres] / actual_positives
        if actual_positives > 0
        else float("nan")
    )

# Print the curve at the selected points, from the highest to the lowest threshold
print("Precision and Recall for class 'more'")
print("threshold\trecall\tprecision")
for thres in reversed(thresholds):
    print(str(thres) + "\t" + str(recall[thres]) + "\t" + str(precision[thres]))

.. autofunction:: deploy_regressor_for_metrics

# Imports
import os
from khiops import core as kh

# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
output_dir = os.path.join("kh_samples", "deploy_regressor_for_metrics")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
output_data_table_path = os.path.join(output_dir, "TrueAndPredictedAges.txt")

# Percentage of the data table used for training (kept small to be quick)
# It is defined once so the training, the deployment and the printed message agree
train_percentage = 20

# Train the regressor for the target "age"
_, modeling_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "age",
    report_file_path,
    sample_percentage=train_percentage,
    max_trees=0,
)

# Obtain the predicted regression values of the SNB on the test dataset to
# estimate R2. The test dataset is obtained by excluding the train sample
kh.deploy_predictor_for_metrics(
    modeling_dictionary_file_path,
    "SNB_Adult",
    data_table_path,
    output_data_table_path,
    sample_percentage=train_percentage,
    sampling_mode="Exclude sample",
    output_header_line=False,
)

# Estimate R2
# Note: Normally one would do this with a package (eg. sklearn.metrics)
# First pass to estimate the sum of squared residuals and the mean
ss_res = 0
mean = 0
n_instances = 0
with open(output_data_table_path) as output_data_table:
    for line in output_data_table:
        # Fields read: 0 is the true target, 1 is the predicted target
        fields = line.split("\t")
        true_target = float(fields[0])
        predicted_target = float(fields[1])
        ss_res += (true_target - predicted_target) ** 2
        mean += true_target
        n_instances += 1
mean /= n_instances

# Second pass to estimate the total sums of squares and finish the R2 estimation
ss_tot = 0
with open(output_data_table_path) as output_data_table:
    for line in output_data_table:
        fields = line.split("\t")
        true_target = float(fields[0])
        ss_tot += (true_target - mean) ** 2
r2_score = 1 - ss_res / ss_tot

# Print results
print(f"Adult 'age' regression ({train_percentage}% train)")
print(f"R2 (explained variance) = {r2_score}")

.. autofunction:: sort_data_table

# Imports
import os
from khiops import core as kh

# Input files: dictionary and root table of the "AccidentsSummary" sample dataset
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")

# Output file: the sorted table
output_dir = os.path.join("kh_samples", "sort_data_table")
output_data_table_path = os.path.join(output_dir, "SortedAccidents.txt")

# Sort the "Accident" table using only the mandatory parameters
kh.sort_data_table(
    dictionary_file_path,
    "Accident",
    accidents_table_path,
    output_data_table_path,
)

.. autofunction:: sort_data_table_expert

# Imports
import os
from khiops import core as kh

# Input files: dictionary and "Vehicles" table of the "AccidentsSummary" dataset
accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")

# Output file: the sorted table
output_dir = os.path.join("kh_samples", "sort_data_table_expert")
output_data_table_path = os.path.join(output_dir, "SortedVehicles.txt")

# A list containing the sorting fields
sorting_fields = ["AccidentId", "VehicleId"]

# Sort the "Vehicle" table. Besides the mandatory parameters, it is specified:
# - The list of sorting fields defined above
kh.sort_data_table(
    dictionary_file_path,
    "Vehicle",
    vehicles_table_path,
    output_data_table_path,
    sort_variables=sorting_fields,
)

.. autofunction:: extract_keys_from_data_table

# Imports
import os
from khiops import core as kh

# Input files: dictionary and data table of the "SpliceJunction" sample dataset
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")

# Output file: the table receiving the extracted keys
output_dir = os.path.join("kh_samples", "extract_keys_from_data_table")
output_data_table_path = os.path.join(output_dir, "KeysSpliceJunction.txt")

# Extract the keys of the table "SpliceJunctionDNA" into the output table
kh.extract_keys_from_data_table(
    dictionary_file_path, "SpliceJunctionDNA", data_table_path, output_data_table_path
)

.. autofunction:: train_coclustering

# Imports
import os
from khiops import core as kh

# Input files: dictionary and data table of the "SpliceJunction" sample dataset
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")

# Output file: the coclustering report
output_dir = os.path.join("kh_samples", "train_coclustering")
coclustering_report_path = os.path.join(output_dir, "CoclusteringResults.khcj")

# Variables on which the coclustering model is trained
coclustering_variables = ["SampleId", "Char"]

# Train the coclustering model and tell the user where the report is
kh.train_coclustering(
    dictionary_file_path,
    "SpliceJunctionDNA",
    data_table_path,
    coclustering_variables,
    coclustering_report_path,
)
print(f"Coclustering report file available at {coclustering_report_path}")

# If you have Khiops Co-Visualization installed you may open the report as follows
# kh.visualize_report(coclustering_report_path)

.. autofunction:: train_instance_variable_coclustering

# Imports
import os
from khiops import core as kh

# Set the file paths
iris_dir = os.path.join(kh.get_samples_dir(), "Iris")
dictionary_file_path = os.path.join(iris_dir, "Iris.kdic")
data_table_path = os.path.join(iris_dir, "Iris.txt")
coclustering_report_path = os.path.join(
    "kh_samples",
    "train_instance_variable_coclustering",
    "CoclusteringResults.khcj",
)

# Train an instance-variable coclustering model on the "Iris" data table
# No coclustering variables are passed to this function
kh.train_instance_variable_coclustering(
    dictionary_file_path,
    "Iris",
    data_table_path,
    coclustering_report_path,
)
print(
    "Instance-variable coclustering report file available "
    f"at {coclustering_report_path}"
)

# If you have Khiops Co-Visualization installed you may open the report as follows
# kh.visualize_report(coclustering_report_path)

.. autofunction:: simplify_coclustering

# Imports
import os
from khiops import core as kh

# Input files: dictionary and data table of the "SpliceJunction" sample dataset
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")

# Output files: the trained coclustering and its simplified version
output_dir = os.path.join("kh_samples", "simplify_coclustering")
coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj")
simplified_coclustering_file_path = os.path.join(
    output_dir, "simplified_coclustering.khcj"
)

# Step 1: train the coclustering model for variables "SampleId" and "Char"
coclustering_variables = ["SampleId", "Char"]
kh.train_coclustering(
    dictionary_file_path,
    "SpliceJunctionDNA",
    data_table_path,
    coclustering_variables,
    coclustering_file_path,
)

# Step 2: simplify the trained coclustering with the constraints
# - maximum information preserved: 80%
# - maximum total parts number: 4
simplification_constraints = {
    "max_preserved_information": 80,
    "max_total_parts": 4,
}
kh.simplify_coclustering(
    coclustering_file_path,
    simplified_coclustering_file_path,
    **simplification_constraints,
)

.. autofunction:: extract_clusters

# Imports
import os
from khiops import core as kh

# Set the file paths
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
output_dir = os.path.join("kh_samples", "extract_clusters")
coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj")
clusters_file_path = os.path.join(output_dir, "extracted_clusters.txt")

# Train a coclustering model for variables "SampleId" and "Char"
kh.train_coclustering(
    dictionary_file_path,
    "SpliceJunctionDNA",
    data_table_path,
    ["SampleId", "Char"],
    coclustering_file_path,
)

# Extract the clusters of the coclustering variable "Char" into the clusters file
kh.extract_clusters(coclustering_file_path, "Char", clusters_file_path)

.. autofunction:: deploy_coclustering

# Imports
import os
from khiops import core as kh

# Input files: dictionary and data table of the "SpliceJunction" sample dataset
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")

# Output files: coclustering report, coclustering dictionary and deployed table
output_dir = os.path.join("kh_samples", "deploy_coclustering")
coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj")
coclustering_dictionary_file_path = os.path.join(output_dir, "Coclustering.kdic")
output_data_table_path = os.path.join(output_dir, "DeployedSpliceJunctionDNA.txt")

# Step 1: train a coclustering model for variables "SampleId" and "Char"
coclustering_variables = ["SampleId", "Char"]
kh.train_coclustering(
    dictionary_file_path,
    "SpliceJunctionDNA",
    data_table_path,
    coclustering_variables,
    coclustering_file_path,
)

# Step 2: deploy "Char" clusters in the training database
key_variables = ["SampleId"]
kh.deploy_coclustering(
    dictionary_file_path,
    "SpliceJunctionDNA",
    data_table_path,
    coclustering_file_path,
    key_variables,
    "Char",
    coclustering_dictionary_file_path,
    output_data_table_path,
    header_line=True,
)

.. autofunction:: deploy_coclustering_expert

# Imports
import os
from khiops import core as kh

# Input files: dictionary, main table and secondary table of "SpliceJunction"
splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
data_table_path = os.path.join(splice_dir, "SpliceJunction.txt")
secondary_data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")

# Output files, all written to the same output directory
output_dir = os.path.join("kh_samples", "deploy_coclustering_expert")
coclustering_file_path = os.path.join(output_dir, "Coclustering.khcj")
augmented_dictionary_file_path = os.path.join(output_dir, "Coclustering.kdic")
reaugmented_dictionary_file_path = os.path.join(
    output_dir, "ReaugmentedCoclustering.kdic"
)
output_data_table_path = os.path.join(output_dir, "TransferredSpliceJunction.txt")
deployed_dictionary_file_path = os.path.join(
    output_dir, "Transferred_Coclustering.kdic"
)

# Step 1: train a coclustering model for variables "SampleId" and "Char"
print("train coclustering on SpliceJunctionDNA")
coclustering_variables = ["SampleId", "Char"]
kh.train_coclustering(
    dictionary_file_path,
    "SpliceJunctionDNA",
    secondary_data_table_path,
    coclustering_variables,
    coclustering_file_path,
)

# Step 2: extend the input dictionary with new coclustering based variables
print("prepare_coclustering_deployment")
kh.prepare_coclustering_deployment(
    dictionary_file_path,
    "SpliceJunction",
    coclustering_file_path,
    "DNA",
    "SampleId",
    augmented_dictionary_file_path,
)

# Step 3: extend the already extended dictionary with the new variables from a
# simplified coclustering (at most 2 parts for "SampleId", variables prefixed "C2_")
print("prepare_coclustering_deployment with at most two clusters")
kh.prepare_coclustering_deployment(
    augmented_dictionary_file_path,
    "SpliceJunction",
    coclustering_file_path,
    "DNA",
    "SampleId",
    reaugmented_dictionary_file_path,
    variables_prefix="C2_",
    max_part_numbers={"SampleId": 2},
)

# Step 4: deploy the coclustering with the extended dictionary
print("deploy_model with the new coclustering based variables")
kh.deploy_model(
    reaugmented_dictionary_file_path,
    "SpliceJunction",
    data_table_path,
    output_data_table_path,
    additional_data_tables={"DNA": secondary_data_table_path},
)

# Step 5: build the dictionary of the deployed table
print("build_deployed_dictionary to get the new dictionary")
kh.build_deployed_dictionary(
    reaugmented_dictionary_file_path,
    "SpliceJunction",
    deployed_dictionary_file_path,
)

.. autofunction:: scenario_prologue

# Imports
import os
from khiops import core as kh

# Set the file paths
dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
report_file_path = os.path.join(
    "kh_samples", "scenario_prologue", "AnalysisResults.khj"
)

# Set the maximum memory "by hand" with a scenario prologue
# The prologue is a piece of scenario text passed as-is to train_predictor
scenario_prologue = """
    // Max memory 2000 mb
    AnalysisSpec.SystemParameters.MemoryLimit 2000
    """

# Train the predictor
kh.train_predictor(
    dictionary_file_path,
    "Adult",
    data_table_path,
    "class",
    report_file_path,
    max_trees=0,
    scenario_prologue=scenario_prologue,
)

.. autofunction:: build_deployed_dictionary

# Imports
import os
from khiops import core as kh

# Input files: dictionary and data table of the "Iris" sample dataset
iris_dir = os.path.join(kh.get_samples_dir(), "Iris")
dictionary_file_path = os.path.join(iris_dir, "Iris.kdic")
data_table_path = os.path.join(iris_dir, "Iris.txt")

# Output files: the training report and the deployed dictionary
output_dir = os.path.join("kh_samples", "build_deployed_dictionary")
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
deployed_dictionary_file_path = os.path.join(output_dir, "SNB_Iris_deployed.kdic")

# Step 1: train the predictor for the target "Class"
_, modeling_dictionary_file_path = kh.train_predictor(
    dictionary_file_path,
    "Iris",
    data_table_path,
    "Class",
    report_file_path,
    max_trees=0,
)

# Step 2: build the dictionary to read the output of the predictor dictionary file
# It will contain the columns of the table generated by deploying the model
kh.build_deployed_dictionary(
    modeling_dictionary_file_path, "SNB_Iris", deployed_dictionary_file_path
)

# Step 3: print the contents of the deployed dictionary file unchanged
with open(deployed_dictionary_file_path) as deployed_dictionary_file:
    print(deployed_dictionary_file.read(), end="")

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Samples core

Script and Jupyter notebook

Setup

Samples

FilesExpand file tree

samples.rst

Latest commit

History

samples.rst

File metadata and controls

Samples core

Script and Jupyter notebook

Setup

Samples