KhiopsML
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 1 addition & 0 deletions
diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst
Lines changed: 66 additions & 0 deletions
diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb
Lines changed: 79 additions & 0 deletions
diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py
Lines changed: 70 additions & 0 deletions
diff --git a/khiops/sklearn/dataset.py b/khiops/sklearn/dataset.py
Lines changed: 39 additions & 7 deletions
@@ -11,6 +11,7 @@
 ### Added
 - (`core`) Dictionary API support for dictionary, variable and variable block
   comments, and dictionary and variable block internal comments.
+- (`sklearn`) `Text` Khiops type support at the estimator level.
 
 ### Fixed
 - (General) Inconsistency between the `tools.download_datasets` function and the
 
@@ -148,6 +148,72 @@ Samples
     test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr")
     print(f"Test accuracy = {test_accuracy}")
     print(f"Test auc      = {test_auc}")
+.. autofunction:: khiops_classifier_text
+.. code-block:: python
+
+    # Imports
+    import os
+    import pandas as pd
+    from khiops import core as kh
+    from khiops.sklearn import KhiopsClassifier
+    from sklearn import metrics
+    from sklearn.model_selection import train_test_split
+
+    # Load the dataset into a pandas dataframe
+    data_table_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
+    )
+    data_df = pd.read_csv(data_table_path, sep="\t")
+
+    # Split the whole dataframe into train and test (70%-30%)
+    data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)
+
+    # Split the dataset into:
+    # - the X feature table
+    # - the y target vector ("negativereason" column)
+    X_train = data_train_df.drop("negativereason", axis=1)
+    X_test = data_test_df.drop("negativereason", axis=1)
+    y_train = data_train_df["negativereason"]
+    y_test = data_test_df["negativereason"]
+
+    # Use the pandas StringDtype on "text" so Khiops can infer its `Text` type
+    X_train["text"] = X_train["text"].astype("string")
+    X_test["text"] = X_test["text"].astype("string")
+
+    # Create the classifier object
+    khc = KhiopsClassifier()
+
+    # Train the classifier
+    khc.fit(X_train, y_train)
+
+    # Show the feature importance info
+    print(f"Features evaluated: {khc.n_features_evaluated_}")
+    print(f"Features selected : {khc.n_features_used_}")
+    print("Top 3 used features")
+    for i, feature in enumerate(khc.feature_used_names_[:3]):
+        print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
+    print("---")
+
+    # Predict the classes on the test dataset
+    y_test_pred = khc.predict(X_test)
+    print("Predicted classes (first 10):")
+    print(y_test_pred[0:10])
+    print("---")
+
+    # Predict the class probabilities on the test dataset
+    y_test_probas = khc.predict_proba(X_test)
+    print(f"Class order: {khc.classes_}")
+    print("Predicted class probabilities (first 10):")
+    print(y_test_probas[0:10])
+    print("---")
+
+    # Evaluate the accuracy metric on the test dataset
+    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
+    print(f"Test accuracy = {test_accuracy}")
+
+    # If you have Khiops Visualization installed you may open the report as follows
+    # khc.export_report_file("report.khj")
+    # kh.visualize_report("report.khj")
 .. autofunction:: khiops_classifier_multitable_star
 .. code-block:: python
 
 
@@ -149,6 +149,85 @@
     "print(f\"Test auc      = {test_auc}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### `khiops_classifier_text()`\n\n",
+    "Train a `.KhiopsClassifier` on a monotable dataframe with textual data\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "from khiops import core as kh\n",
+    "from khiops.sklearn import KhiopsClassifier\n",
+    "from sklearn import metrics\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Load the dataset into a pandas dataframe\n",
+    "data_table_path = os.path.join(\n",
+    "    kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n",
+    ")\n",
+    "data_df = pd.read_csv(data_table_path, sep=\"\\t\")\n",
+    "\n",
+    "# Split the whole dataframe into train and test (70%-30%)\n",
+    "data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)\n",
+    "\n",
+    "# Split the dataset into:\n",
+    "# - the X feature table\n",
+    "# - the y target vector (\"negativereason\" column)\n",
+    "X_train = data_train_df.drop(\"negativereason\", axis=1)\n",
+    "X_test = data_test_df.drop(\"negativereason\", axis=1)\n",
+    "y_train = data_train_df[\"negativereason\"]\n",
+    "y_test = data_test_df[\"negativereason\"]\n",
+    "\n",
+    "# Use the pandas StringDtype on \"text\" so Khiops can infer its `Text` type\n",
+    "X_train[\"text\"] = X_train[\"text\"].astype(\"string\")\n",
+    "X_test[\"text\"] = X_test[\"text\"].astype(\"string\")\n",
+    "\n",
+    "# Create the classifier object\n",
+    "khc = KhiopsClassifier()\n",
+    "\n",
+    "# Train the classifier\n",
+    "khc.fit(X_train, y_train)\n",
+    "\n",
+    "# Show the feature importance info\n",
+    "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n",
+    "print(f\"Features selected : {khc.n_features_used_}\")\n",
+    "print(\"Top 3 used features\")\n",
+    "for i, feature in enumerate(khc.feature_used_names_[:3]):\n",
+    "    print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
+    "print(\"---\")\n",
+    "\n",
+    "# Predict the classes on the test dataset\n",
+    "y_test_pred = khc.predict(X_test)\n",
+    "print(\"Predicted classes (first 10):\")\n",
+    "print(y_test_pred[0:10])\n",
+    "print(\"---\")\n",
+    "\n",
+    "# Predict the class probabilities on the test dataset\n",
+    "y_test_probas = khc.predict_proba(X_test)\n",
+    "print(f\"Class order: {khc.classes_}\")\n",
+    "print(\"Predicted class probabilities (first 10):\")\n",
+    "print(y_test_probas[0:10])\n",
+    "print(\"---\")\n",
+    "\n",
+    "# Evaluate the accuracy metric on the test dataset\n",
+    "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n",
+    "print(f\"Test accuracy = {test_accuracy}\")\n",
+    "\n",
+    "# If you have Khiops Visualization installed you may open the report as follows\n",
+    "# khc.export_report_file(\"report.khj\")\n",
+    "# kh.visualize_report(\"report.khj\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
 
@@ -142,6 +142,75 @@ def khiops_classifier_multiclass():
     print(f"Test auc      = {test_auc}")
 
 
+def khiops_classifier_text():
+    """Train a `.KhiopsClassifier` on a monotable dataframe with textual data"""
+    # Imports
+    import os
+    import pandas as pd
+    from khiops import core as kh
+    from khiops.sklearn import KhiopsClassifier
+    from sklearn import metrics
+    from sklearn.model_selection import train_test_split
+
+    # Load the dataset into a pandas dataframe
+    data_table_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
+    )
+    data_df = pd.read_csv(data_table_path, sep="\t")
+
+    # Split the whole dataframe into train and test (70%-30%)
+    data_train_df, data_test_df = train_test_split(
+        data_df, test_size=0.3, random_state=1
+    )
+
+    # Split the dataset into:
+    # - the X feature table
+    # - the y target vector ("negativereason" column)
+    X_train = data_train_df.drop("negativereason", axis=1)
+    X_test = data_test_df.drop("negativereason", axis=1)
+    y_train = data_train_df["negativereason"]
+    y_test = data_test_df["negativereason"]
+
+    # Use the pandas StringDtype on "text" so Khiops can infer its `Text` type
+    X_train["text"] = X_train["text"].astype("string")
+    X_test["text"] = X_test["text"].astype("string")
+
+    # Create the classifier object
+    khc = KhiopsClassifier()
+
+    # Train the classifier
+    khc.fit(X_train, y_train)
+
+    # Show the feature importance info (entry 2 of each row is shown as importance)
+    print(f"Features evaluated: {khc.n_features_evaluated_}")
+    print(f"Features selected : {khc.n_features_used_}")
+    print("Top 3 used features")
+    for i, feature in enumerate(khc.feature_used_names_[:3]):
+        print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
+    print("---")
+
+    # Predict the classes on the test dataset
+    y_test_pred = khc.predict(X_test)
+    print("Predicted classes (first 10):")
+    print(y_test_pred[0:10])
+    print("---")
+
+    # Predict the class probabilities on the test dataset
+    y_test_probas = khc.predict_proba(X_test)
+    print(f"Class order: {khc.classes_}")
+    print("Predicted class probabilities (first 10):")
+    print(y_test_probas[0:10])
+    print("---")
+
+    # Evaluate the accuracy metric on the test dataset
+    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
+    print(f"Test accuracy = {test_accuracy}")
+
+    # If you have Khiops Visualization installed you may open the report as follows
+    # khc.export_report_file("report.khj")
+    # kh.visualize_report("report.khj")
+
+
 def khiops_classifier_multitable_star():
     """Trains a `.KhiopsClassifier` on a star multi-table dataset"""
     # Imports
@@ -831,6 +900,7 @@ def khiops_coclustering_simplify():
 exported_samples = [
     khiops_classifier,
     khiops_classifier_multiclass,
+    khiops_classifier_text,
     khiops_classifier_multitable_star,
     khiops_classifier_multitable_snowflake,
     khiops_classifier_sparse,
 
@@ -236,19 +236,40 @@ def _upgrade_mapping_spec(ds_spec):
     return new_ds_spec
 
 
-def get_khiops_type(numpy_type):
+def get_khiops_type(numpy_type, categorical_str_max_size=None):
     """Translates a numpy dtype to a Khiops dictionary type
 
     Parameters
     ----------
-    numpy_type : `numpy.dtype`:
+    numpy_type : `numpy.dtype`
         Numpy type of the column
+    categorical_str_max_size : `int`, optional
+        Maximum length of the entries of the column whose type is ``numpy_type``.
 
     Returns
     -------
     str
-        Khiops type name. Either "Categorical", "Numerical" or "Timestamp"
+        Khiops type name. Either "Categorical", "Text", "Numerical" or "Timestamp".
+
+    .. note::
+        The "Text" Khiops type is inferred only for the pandas "string" dtype, and
+        only when ``categorical_str_max_size`` is greater than 100.
+
     """
+    # Check categorical_str_max_size type
+    if categorical_str_max_size is not None and not isinstance(
+        categorical_str_max_size, (int, np.int64)
+    ):
+        raise TypeError(
+            type_error_message(
+                "categorical_str_max_size",
+                categorical_str_max_size,
+                int,
+                np.int64,
+            )
+        )
+
+    # Get the Numpy dtype in lowercase
     lower_numpy_type = str(numpy_type).lower()
 
     # timedelta64 and datetime64 types
@@ -257,6 +278,11 @@ def get_khiops_type(numpy_type):
     # float<x>, int<x>, uint<x> types
     elif "int" in lower_numpy_type or "float" in lower_numpy_type:
         khiops_type = "Numerical"
+    elif lower_numpy_type == "string":
+        if categorical_str_max_size is not None and categorical_str_max_size > 100:
+            khiops_type = "Text"
+        else:
+            khiops_type = "Categorical"
     # bool_ and object, character, bytes_, str_, void, record and other types
     else:
         khiops_type = "Categorical"
@@ -956,10 +982,16 @@ def __init__(self, name, dataframe, key=None):
                 )
 
         # Initialize Khiops types
-        self.khiops_types = {
-            column_id: get_khiops_type(self.data_source.dtypes[column_id])
-            for column_id in self.column_ids
-        }
+        self.khiops_types = {}
+        for column_id in self.column_ids:
+            column = self.data_source[column_id]
+            column_numpy_type = column.dtype
+            column_max_size = None
+            if isinstance(column_numpy_type, pd.StringDtype):
+                column_max_size = column.str.len().max()
+            self.khiops_types[column_id] = get_khiops_type(
+                column_numpy_type, column_max_size
+            )
 
         # Check key integrity
         self.check_key()