From 1697c1e81a445bc2b08fc2b7414d6e8a02be1a77 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Fri, 28 Mar 2025 18:11:45 +0100 Subject: [PATCH 1/4] Update samples version to v11 in the CI --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b56f17d5..f88415a9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,13 +1,13 @@ --- name: Tests env: - DEFAULT_SAMPLES_REVISION: 10.2.4 + DEFAULT_SAMPLES_REVISION: 11.0.0 DEFAULT_KHIOPS_DESKTOP_REVISION: 10.6.0-b.0 on: workflow_dispatch: inputs: samples-revision: - default: 10.2.4 + default: 11.0.0 description: Git Tag/Branch/Commit for the khiops-samples Repo image-tag: default: 10.6.0-b.0.0 From 74916d40bca3360460d3c119d89aaefbad686063 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Fri, 28 Mar 2025 18:12:12 +0100 Subject: [PATCH 2/4] Add Text and TextList support to DictionaryDomain related_to #330 --- khiops/core/dictionary.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/khiops/core/dictionary.py b/khiops/core/dictionary.py index 473997db..e90f76c6 100644 --- a/khiops/core/dictionary.py +++ b/khiops/core/dictionary.py @@ -70,7 +70,7 @@ def _format_name(name): def _quote_value(value): """Double-quotes a string - Categorical and metadata values are quoted with this method. + Categorical, Text and metadata values are quoted with this method. """ if isinstance(value, str): quoted_value = '"' + value.replace('"', '""') + '"' @@ -1075,7 +1075,15 @@ def is_native(self): ``True`` if a variables comes directly from a data column. 
""" - base_types = ["Categorical", "Numerical", "Time", "Date", "Timestamp"] + base_types = [ + "Categorical", + "Numerical", + "Time", + "Date", + "Timestamp", + "Text", + "TextList", + ] if self.variable_block is None: return self.rule == "" and self.type in base_types return self.variable_block.rule == "" From cb73a62aacd467fbf93f9bade5f65b5e20ca2fd5 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Tue, 1 Apr 2025 17:44:59 +0200 Subject: [PATCH 3/4] Add basic text features samples --- doc/samples/samples.rst | 67 ++++++++++++++++++++++++++ khiops/samples/samples.ipynb | 93 ++++++++++++++++++++++++++++++++++++ khiops/samples/samples.py | 78 ++++++++++++++++++++++++++++++ 3 files changed, 238 insertions(+) diff --git a/doc/samples/samples.rst b/doc/samples/samples.rst index 2cc8a57a..23e7a4c6 100644 --- a/doc/samples/samples.rst +++ b/doc/samples/samples.rst @@ -212,6 +212,35 @@ Samples # If you have Khiops Visualization installed you may open the report as follows # kh.visualize_report(report_file_path) +.. autofunction:: train_predictor_text +.. code-block:: python + + # Imports + import os + from khiops import core as kh + + # Set the file paths + dictionary_file_path = os.path.join( + kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic" + ) + data_table_path = os.path.join( + kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt" + ) + report_file_path = os.path.join( + "kh_samples", "train_predictor_text", "AnalysisResults.khj" + ) + + # Train the predictor + kh.train_predictor( + dictionary_file_path, + "FlightNegativeTweets", + data_table_path, + "negativereason", + report_file_path, + max_trees=5, + max_text_features=1000, + text_features="words", + ) .. autofunction:: train_predictor_error_handling .. code-block:: python @@ -948,6 +977,44 @@ Samples kh.deploy_model( model_dictionary_file_path, "SNB_Adult", data_table_path, output_data_table_path ) +.. 
autofunction:: deploy_model_text +.. code-block:: python + + # Imports + import os + from khiops import core as kh + + # Set the file paths + dictionary_file_path = os.path.join( + kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic" + ) + data_table_path = os.path.join( + kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt" + ) + output_dir = os.path.join("kh_samples", "deploy_model_text") + report_file_path = os.path.join(output_dir, "AnalysisResults.khj") + output_data_table_path = os.path.join(output_dir, "ScoresNegativeAirlineTweets.txt") + + # Train the predictor + _, model_dictionary_file_path = kh.train_predictor( + dictionary_file_path, + "FlightNegativeTweets", + data_table_path, + "negativereason", + report_file_path, + max_trees=5, + max_text_features=1000, + text_features="words", + ) + + # Deploy the model on the database + # It will score it according to the trained predictor + kh.deploy_model( + model_dictionary_file_path, + "SNB_FlightNegativeTweets", + data_table_path, + output_data_table_path, + ) .. autofunction:: deploy_model_mt .. 
code-block:: python diff --git a/khiops/samples/samples.ipynb b/khiops/samples/samples.ipynb index e4ed5f8c..f0b1feeb 100644 --- a/khiops/samples/samples.ipynb +++ b/khiops/samples/samples.ipynb @@ -278,6 +278,48 @@ "# kh.visualize_report(report_file_path)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_predictor_text()`\n\n", + "Trains a predictor with just text-specific parameters\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", + "\n", + "# Set the file paths\n", + "dictionary_file_path = os.path.join(\n", + " kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.kdic\"\n", + ")\n", + "data_table_path = os.path.join(\n", + " kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n", + ")\n", + "report_file_path = os.path.join(\n", + " \"kh_samples\", \"train_predictor_text\", \"AnalysisResults.khj\"\n", + ")\n", + "\n", + "# Train the predictor\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"FlightNegativeTweets\",\n", + " data_table_path,\n", + " \"negativereason\",\n", + " report_file_path,\n", + " max_trees=5,\n", + " max_text_features=1000,\n", + " text_features=\"words\",\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1248,6 +1290,57 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `deploy_model_text()`\n\n", + "Deploys a model learned on textual data\n It is a call to `~.api.deploy_model` with its mandatory parameters, plus\n text-specific parameters.\n\n In this example, a Selective Naive Bayes (SNB) model is deployed by applying its\n associated dictionary to the input database. 
The model predictions are written to\n the output database.\n \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", + "\n", + "# Set the file paths\n", + "dictionary_file_path = os.path.join(\n", + " kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.kdic\"\n", + ")\n", + "data_table_path = os.path.join(\n", + " kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n", + ")\n", + "output_dir = os.path.join(\"kh_samples\", \"deploy_model_text\")\n", + "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n", + "output_data_table_path = os.path.join(output_dir, \"ScoresNegativeAirlineTweets.txt\")\n", + "\n", + "# Train the predictor\n", + "_, model_dictionary_file_path = kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"FlightNegativeTweets\",\n", + " data_table_path,\n", + " \"negativereason\",\n", + " report_file_path,\n", + " max_trees=5,\n", + " max_text_features=1000,\n", + " text_features=\"words\",\n", + ")\n", + "\n", + "# Deploy the model on the database\n", + "# It will score it according to the trained predictor\n", + "kh.deploy_model(\n", + " model_dictionary_file_path,\n", + " \"SNB_FlightNegativeTweets\",\n", + " data_table_path,\n", + " output_data_table_path,\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/khiops/samples/samples.py b/khiops/samples/samples.py index 1cd9f3af..532bb4dc 100644 --- a/khiops/samples/samples.py +++ b/khiops/samples/samples.py @@ -233,6 +233,36 @@ def train_predictor_file_paths(): # kh.visualize_report(report_file_path) +def train_predictor_text(): + """Trains a predictor with just text-specific parameters""" + # Imports + import os + from khiops import core as kh + + # Set the file paths + dictionary_file_path = os.path.join( + kh.get_samples_dir(), "NegativeAirlineTweets", 
"NegativeAirlineTweets.kdic" + ) + data_table_path = os.path.join( + kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt" + ) + report_file_path = os.path.join( + "kh_samples", "train_predictor_text", "AnalysisResults.khj" + ) + + # Train the predictor + kh.train_predictor( + dictionary_file_path, + "FlightNegativeTweets", + data_table_path, + "negativereason", + report_file_path, + max_trees=5, + max_text_features=1000, + text_features="words", + ) + + def train_predictor_error_handling(): """Shows how to handle errors when training a predictor @@ -1059,6 +1089,52 @@ def deploy_model(): ) +def deploy_model_text(): + """Deploys a model learned on textual data + It first trains a predictor with text-specific parameters, then calls + `~.api.deploy_model` with only its mandatory parameters. + + In this example, a Selective Naive Bayes (SNB) model is deployed by applying its + associated dictionary to the input database. The model predictions are written to + the output database. + """ + # Imports + import os + from khiops import core as kh + + # Set the file paths + dictionary_file_path = os.path.join( + kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic" + ) + data_table_path = os.path.join( + kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt" + ) + output_dir = os.path.join("kh_samples", "deploy_model_text") + report_file_path = os.path.join(output_dir, "AnalysisResults.khj") + output_data_table_path = os.path.join(output_dir, "ScoresNegativeAirlineTweets.txt") + + # Train the predictor + _, model_dictionary_file_path = kh.train_predictor( + dictionary_file_path, + "FlightNegativeTweets", + data_table_path, + "negativereason", + report_file_path, + max_trees=5, + max_text_features=1000, + text_features="words", + ) + + # Deploy the model on the database + # It will score it according to the trained predictor + kh.deploy_model( + model_dictionary_file_path, + "SNB_FlightNegativeTweets", + data_table_path, + 
output_data_table_path, + ) + + def deploy_model_mt(): """Deploys a multi-table classifier in the simplest way possible @@ -1811,6 +1887,7 @@ def build_deployed_dictionary(): export_dictionary_files, train_predictor, train_predictor_file_paths, + train_predictor_text, train_predictor_error_handling, train_predictor_mt, train_predictor_mt_with_specific_rules, @@ -1829,6 +1906,7 @@ def build_deployed_dictionary(): train_recoder_with_multiple_parameters, train_recoder_mt_flatten, deploy_model, + deploy_model_text, deploy_model_mt, deploy_model_mt_with_interpretation, deploy_model_mt_snowflake, From 1d3d80d6dcd9dbd84a8fcd92a62fd90304d6aaa3 Mon Sep 17 00:00:00 2001 From: Popescu V <136721202+popescu-v@users.noreply.github.com> Date: Tue, 1 Apr 2025 17:21:40 +0200 Subject: [PATCH 4/4] Add TimestampTZ to the native types --- khiops/core/dictionary.py | 1 + 1 file changed, 1 insertion(+) diff --git a/khiops/core/dictionary.py b/khiops/core/dictionary.py index e90f76c6..007f4d06 100644 --- a/khiops/core/dictionary.py +++ b/khiops/core/dictionary.py @@ -1081,6 +1081,7 @@ def is_native(self): "Time", "Date", "Timestamp", + "TimestampTZ", "Text", "TextList", ]