Add Text data Sklearn sample

popescu-v · popescu-v · commit d8d9b88ba78d · 2025-09-01T15:30:54.000+02:00
diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst
@@ -95,6 +95,72 @@ Samples
     print(f"Test accuracy = {test_accuracy}")
     print(f"Test auc      = {test_auc}")
 
+    # If you have Khiops Visualization installed you may open the report as follows
+    # khc.export_report_file("report.khj")
+    # kh.visualize_report("report.khj")
+.. autofunction:: khiops_classifier_text
+.. code-block:: python
+
+    # Imports
+    import os
+    import pandas as pd
+    from khiops import core as kh
+    from khiops.sklearn import KhiopsClassifier
+    from sklearn import metrics
+    from sklearn.model_selection import train_test_split
+
+    # Load the dataset into a pandas dataframe
+    data_table_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
+    )
+    data_df = pd.read_csv(data_table_path, sep="\t")
+
+    # Split the whole dataframe into train and test (70%-30%)
+    data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)
+
+    # Split the dataset into:
+    # - the X feature table
+    # - the y target vector ("negativereason" column)
+    X_train = data_train_df.drop("negativereason", axis=1)
+    X_test = data_test_df.drop("negativereason", axis=1)
+    y_train = data_train_df["negativereason"]
+    y_test = data_test_df["negativereason"]
+
+    # Set the pandas StringDtype on the "text" column
+    X_train["text"] = X_train["text"].astype("string")
+    X_test["text"] = X_test["text"].astype("string")
+
+    # Create the classifier object
+    khc = KhiopsClassifier()
+
+    # Train the classifier
+    khc.fit(X_train, y_train)
+
+    # Show the feature importance info
+    print(f"Features evaluated: {khc.n_features_evaluated_}")
+    print(f"Features selected : {khc.n_features_used_}")
+    print("Top 3 used features")
+    for i, feature in enumerate(khc.feature_used_names_[:3]):
+        print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
+    print("---")
+
+    # Predict the classes on the test dataset
+    y_test_pred = khc.predict(X_test)
+    print("Predicted classes (first 10):")
+    print(y_test_pred[0:10])
+    print("---")
+
+    # Predict the class probabilities on the test dataset
+    y_test_probas = khc.predict_proba(X_test)
+    print(f"Class order: {khc.classes_}")
+    print("Predicted class probabilities (first 10):")
+    print(y_test_probas[0:10])
+    print("---")
+
+    # Evaluate the accuracy metric on the test dataset
+    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
+    print(f"Test accuracy = {test_accuracy}")
+
     # If you have Khiops Visualization installed you may open the report as follows
     # khc.export_report_file("report.khj")
     # kh.visualize_report("report.khj")
diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb
@@ -86,6 +86,85 @@
     "# kh.visualize_report(\"report.khj\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### `khiops_classifier_text()`\n\n",
+    "Train a `.KhiopsClassifier` on a monotable dataframe with textual data\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "from khiops import core as kh\n",
+    "from khiops.sklearn import KhiopsClassifier\n",
+    "from sklearn import metrics\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Load the dataset into a pandas dataframe\n",
+    "data_table_path = os.path.join(\n",
+    "    kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n",
+    ")\n",
+    "data_df = pd.read_csv(data_table_path, sep=\"\\t\")\n",
+    "\n",
+    "# Split the whole dataframe into train and test (70%-30%)\n",
+    "data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=1)\n",
+    "\n",
+    "# Split the dataset into:\n",
+    "# - the X feature table\n",
+    "# - the y target vector (\"negativereason\" column)\n",
+    "X_train = data_train_df.drop(\"negativereason\", axis=1)\n",
+    "X_test = data_test_df.drop(\"negativereason\", axis=1)\n",
+    "y_train = data_train_df[\"negativereason\"]\n",
+    "y_test = data_test_df[\"negativereason\"]\n",
+    "\n",
+    "# Set the pandas StringDtype on the \"text\" column\n",
+    "X_train[\"text\"] = X_train[\"text\"].astype(\"string\")\n",
+    "X_test[\"text\"] = X_test[\"text\"].astype(\"string\")\n",
+    "\n",
+    "# Create the classifier object\n",
+    "khc = KhiopsClassifier()\n",
+    "\n",
+    "# Train the classifier\n",
+    "khc.fit(X_train, y_train)\n",
+    "\n",
+    "# Show the feature importance info\n",
+    "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n",
+    "print(f\"Features selected : {khc.n_features_used_}\")\n",
+    "print(\"Top 3 used features\")\n",
+    "for i, feature in enumerate(khc.feature_used_names_[:3]):\n",
+    "    print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n",
+    "print(\"---\")\n",
+    "\n",
+    "# Predict the classes on the test dataset\n",
+    "y_test_pred = khc.predict(X_test)\n",
+    "print(\"Predicted classes (first 10):\")\n",
+    "print(y_test_pred[0:10])\n",
+    "print(\"---\")\n",
+    "\n",
+    "# Predict the class probabilities on the test dataset\n",
+    "y_test_probas = khc.predict_proba(X_test)\n",
+    "print(f\"Class order: {khc.classes_}\")\n",
+    "print(\"Predicted class probabilities (first 10):\")\n",
+    "print(y_test_probas[0:10])\n",
+    "print(\"---\")\n",
+    "\n",
+    "# Evaluate the accuracy metric on the test dataset\n",
+    "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n",
+    "print(f\"Test accuracy = {test_accuracy}\")\n",
+    "\n",
+    "# If you have Khiops Visualization installed you may open the report as follows\n",
+    "# khc.export_report_file(\"report.khj\")\n",
+    "# kh.visualize_report(\"report.khj\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py
@@ -24,6 +24,75 @@
 # pylint: disable=import-outside-toplevel,redefined-outer-name,reimported
 
 
+def khiops_classifier_text():
+    """Train a `.KhiopsClassifier` on a monotable dataframe with textual data"""
+    # Imports
+    import os
+    import pandas as pd
+    from khiops import core as kh
+    from khiops.sklearn import KhiopsClassifier
+    from sklearn import metrics
+    from sklearn.model_selection import train_test_split
+
+    # Load the dataset into a pandas dataframe
+    data_table_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
+    )
+    data_df = pd.read_csv(data_table_path, sep="\t")
+
+    # Split the whole dataframe into train and test (70%-30%)
+    data_train_df, data_test_df = train_test_split(
+        data_df, test_size=0.3, random_state=1
+    )
+
+    # Split the dataset into:
+    # - the X feature table
+    # - the y target vector ("negativereason" column)
+    X_train = data_train_df.drop("negativereason", axis=1)
+    X_test = data_test_df.drop("negativereason", axis=1)
+    y_train = data_train_df["negativereason"]
+    y_test = data_test_df["negativereason"]
+
+    # Set the pandas StringDtype on the "text" column
+    X_train["text"] = X_train["text"].astype("string")
+    X_test["text"] = X_test["text"].astype("string")
+
+    # Create the classifier object
+    khc = KhiopsClassifier()
+
+    # Train the classifier
+    khc.fit(X_train, y_train)
+
+    # Show the feature importance info
+    print(f"Features evaluated: {khc.n_features_evaluated_}")
+    print(f"Features selected : {khc.n_features_used_}")
+    print("Top 3 used features")
+    for i, feature in enumerate(khc.feature_used_names_[:3]):
+        print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}")
+    print("---")
+
+    # Predict the classes on the test dataset
+    y_test_pred = khc.predict(X_test)
+    print("Predicted classes (first 10):")
+    print(y_test_pred[0:10])
+    print("---")
+
+    # Predict the class probabilities on the test dataset
+    y_test_probas = khc.predict_proba(X_test)
+    print(f"Class order: {khc.classes_}")
+    print("Predicted class probabilities (first 10):")
+    print(y_test_probas[0:10])
+    print("---")
+
+    # Evaluate the accuracy metric on the test dataset
+    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
+    print(f"Test accuracy = {test_accuracy}")
+
+    # If you have Khiops Visualization installed you may open the report as follows
+    # khc.export_report_file("report.khj")
+    # kh.visualize_report("report.khj")
+
+
 def khiops_classifier():
     """Trains a `.KhiopsClassifier` on a monotable dataframe"""
     # Imports
@@ -830,6 +899,7 @@ def khiops_coclustering_simplify():
 
 exported_samples = [
     khiops_classifier,
+    khiops_classifier_text,
     khiops_classifier_multiclass,
     khiops_classifier_multitable_star,
     khiops_classifier_multitable_snowflake,