From 55e203f38bf5c3d391d1b2fae3148bbba6a74cbd Mon Sep 17 00:00:00 2001
From: Vincenzo Eduardo Padulano <vincenzo.eduardo.padulano@cern.ch>
Date: Thu, 11 Jun 2026 10:28:08 +0200
Subject: [PATCH] [df] Allow reading a char column into a numpy array

In the AsNumpy operation, values of the dataset are read into a ROOT::RVec
collection of the corresponding column type. Subsequently, the raw data is
accessed from the RVec and used to generate the array interface for a numpy
array view on the collected data.

When the column is of type char, and thus RDF would read values into a
ROOT::RVec<char>, the raw data is accessed as a 'char *'. The Python bindings
automatically convert 'char *' and 'const char *' to Python strings for full
compatibility with existing functions (e.g. otherwise TObject::GetName would not
return a string in Python). Thus, the array interface cannot be generated.

This commit proposes to introduce a special behaviour in AsNumpy to
automatically view the char column as an 'unsigned char' column. This in turn
avoids the automatic conversion on the Python side. An array of
'unsigned char' is interpreted as a numpy array with dtype uint8.

Since this is a decision which might be unexpected to some users, the commit
also proposes to let the user know about it via a warning.
---
 .../python/ROOT/_pythonization/_rdataframe.py | 10 +++++++++
 .../pythonizations/test/rdataframe_asnumpy.py | 22 ++++++++++++++++++-
 tree/dataframe/src/RDFUtils.cxx               | 16 +++++++++++---
 3 files changed, 44 insertions(+), 4 deletions(-)
diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py
index a19a270c3f0db..ee50e05e7529f 100644
--- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py
+++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py
@@ -229,6 +229,7 @@ def pypowarray(numpyvec, pow):
 
 from __future__ import annotations
 
+import warnings
 from typing import Iterable, Optional
 
 from . import pythonization
@@ -296,6 +297,15 @@ def RDataFrameAsNumpy(
     result_ptrs = {}
     for column in columns:
         column_type = df.GetColumnType(column)
+        if column_type == "char":
+            column_type = "unsigned char"
+            warnings.warn(
+                f"RDataFrame.AsNumpy: column '{column}' has type 'char', which would be automatically converted to a "
+                "Python string. Interpreting as 'unsigned char' instead, which results in a numpy array of dtype uint8. "
+                "If you use this column for numeric values, consider migrating this column to a type other than 'char', "
+                "which is usually used for text. For example, migrate to 'signed char' or 'unsigned char' or, "
+                "preferrably, 'std::int8_t' or 'std::uint8_t'."
+            )
 
         # If the column type is a class, make sure cling knows about it
         tclass = ROOT.TClass.GetClass(column_type)
diff --git a/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py b/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py
index 0bc84e84b5b41..2ecdd9dafa262 100644
--- a/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py
+++ b/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py
@@ -7,7 +7,7 @@
 import numpy as np
 import ROOT
 from ROOT._pythonization._rdataframe import _clone_asnumpyresult
-import os
+
 
 def make_tree(*dtypes):
     """
@@ -400,6 +400,26 @@ def test_rdataframe_as_numpy_array_jagged(self):
         self.assertTrue(all(isinstance(x, np.ndarray) for x in array))
         self.assertTrue(all(len(x) == i for i, x in enumerate(array)))
 
+    def test_rdataframe_as_numpy_char_col_as_uint8(self):
+        """Check that AsNumpy reads a 'char' column as uint8 and warns the user about it (GH-22554)."""
+        ROOT.gInterpreter.Declare(
+            r"""
+            #ifndef ROOT_TEST_RDataFrameAsNumpy_GH_22554
+            #define ROOT_TEST_RDataFrameAsNumpy_GH_22554
+            char make_char(ULong64_t i) {
+            return static_cast<char>(65 + (i % 26)); // A, B, C, ...
+            }
+            #endif
+            """
+        )
+
+        rdf = ROOT.RDataFrame(10).Define("mycol", "make_char(rdfentry_)")
+        # assertWarns only uses 'msg' as failure text; assertWarnsRegex actually checks the warning message.
+        with self.assertWarnsRegex(UserWarning, "column 'mycol' has type 'char'"):
+            npy = rdf.AsNumpy(["mycol"])["mycol"]
+        self.assertEqual(npy.dtype, np.uint8)
+        np.testing.assert_array_equal(npy, np.array([65 + (i % 26) for i in range(10)], dtype=np.uint8))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tree/dataframe/src/RDFUtils.cxx b/tree/dataframe/src/RDFUtils.cxx
index 5d810b13df80e..5374fa5cb45a3 100644
--- a/tree/dataframe/src/RDFUtils.cxx
+++ b/tree/dataframe/src/RDFUtils.cxx
@@ -513,9 +513,19 @@ unsigned int GetColumnWidth(const std::vector<std::string>& names, const unsigne
 void CheckReaderTypeMatches(const std::type_info &colType, const std::type_info &requestedType,
                             const std::string &colName)
 {
-   // We want to explicitly support the reading of bools as unsigned char, as
-   // this is quite common to circumvent the std::vector<bool> specialization.
-   const bool explicitlySupported = (colType == typeid(bool) && requestedType == typeid(unsigned char)) ? true : false;
+   // We explicitly support certain type conversions
+   const bool explicitlySupported = [&colType, &requestedType]() {
+      // bool as unsigned char is common to circumvent the std::vector<bool> specialization.
+      if (colType == typeid(bool) && requestedType == typeid(unsigned char))
+         return true;
+      // char as unsigned char allows reading a vector of char as a Python numpy array of integers, avoiding the
+      // automatic conversion of 'char *' to string in Python. For more info, see
+      // https://github.com/root-project/root/issues/22554
+      if (colType == typeid(char) && requestedType == typeid(unsigned char))
+         return true;
+
+      return false;
+   }();
 
    // Here we compare names and not typeinfos since they may come from two different contexts: a compiled
    // and a jitted one.