From 55e203f38bf5c3d391d1b2fae3148bbba6a74cbd Mon Sep 17 00:00:00 2001 From: Vincenzo Eduardo Padulano Date: Thu, 11 Jun 2026 10:28:08 +0200 Subject: [PATCH] [df] Allow reading a char column into a numpy array In the AsNumpy operation, values of the dataset are read into a ROOT::RVec collection of the corresponding column type. Subsequently, the raw data is accessed from the RVec and used to generate the array interface for a numpy array view on the collected data. When the column is of type char, and thus RDF would read values into a ROOT::RVec<char>, the raw data is accessed as a 'char *'. The Python bindings automatically convert 'char *' and 'const char *' to Python strings for full compatibility with existing functions (e.g. otherwise TObject::GetName would not return a string in Python). Thus, the array interface cannot be generated. This commit proposes to introduce a special behaviour in AsNumpy to automatically view the char column as an 'unsigned char' column. This in turn will not incur the automatic conversion on the Python side. An array of 'unsigned char' is interpreted as a numpy array with dtype uint8. Since this is a decision which might be unexpected to some users, the commit also proposes to let the user know about it via a warning. 
--- .../python/ROOT/_pythonization/_rdataframe.py | 10 +++++++++ .../pythonizations/test/rdataframe_asnumpy.py | 22 ++++++++++++++++++- tree/dataframe/src/RDFUtils.cxx | 16 +++++++++++--- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py index a19a270c3f0db..ee50e05e7529f 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py @@ -229,6 +229,7 @@ def pypowarray(numpyvec, pow): from __future__ import annotations +import warnings from typing import Iterable, Optional from . import pythonization @@ -296,6 +297,15 @@ def RDataFrameAsNumpy( result_ptrs = {} for column in columns: column_type = df.GetColumnType(column) + if column_type == "char": + column_type = "unsigned char" + warnings.warn( + f"RDataFrame.AsNumpy: column '{column}' has type 'char', which would be automatically converted to a " + "Python string. Interpreting as 'unsigned char' instead, which results in a numpy array of dtype uint8. " + "If you use this column for numeric values, consider migrating this column to a type other than 'char', " + "which is usually used for text. For example, migrate to 'signed char' or 'unsigned char' or, " + "preferrably, 'std::int8_t' or 'std::uint8_t'." 
+ ) # If the column type is a class, make sure cling knows about it tclass = ROOT.TClass.GetClass(column_type) diff --git a/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py b/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py index 0bc84e84b5b41..2ecdd9dafa262 100644 --- a/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py +++ b/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py @@ -7,7 +7,7 @@ import numpy as np import ROOT from ROOT._pythonization._rdataframe import _clone_asnumpyresult -import os + def make_tree(*dtypes): """ @@ -400,6 +400,26 @@ def test_rdataframe_as_numpy_array_jagged(self): self.assertTrue(all(isinstance(x, np.ndarray) for x in array)) self.assertTrue(all(len(x) == i for i, x in enumerate(array))) + def test_rdataframe_as_numpy_char_col_as_uint8(self): + ROOT.gInterpreter.Declare( + r""" + #ifndef ROOT_TEST_RDataFrameAsNumpy_GH_22554 + #define ROOT_TEST_RDataFrameAsNumpy_GH_22554 + char make_char(ULong64_t i) { + return static_cast(65 + (i % 26)); // A, B, C, ... 
+ } + #endif + """ + ) + + rdf = ROOT.RDataFrame(10).Define("mycol", "make_char(rdfentry_)") + with self.assertWarns( + UserWarning, msg="column 'mycol' has type 'char', which would be automatically converted" + ): + npy = rdf.AsNumpy(["mycol"])["mycol"] + self.assertEqual(npy.dtype, np.uint8) + self.assertTrue(all(npy == np.array([65 + (i % 26) for i in range(10)], dtype=np.uint8))) + if __name__ == "__main__": unittest.main() diff --git a/tree/dataframe/src/RDFUtils.cxx b/tree/dataframe/src/RDFUtils.cxx index 5d810b13df80e..5374fa5cb45a3 100644 --- a/tree/dataframe/src/RDFUtils.cxx +++ b/tree/dataframe/src/RDFUtils.cxx @@ -513,9 +513,19 @@ unsigned int GetColumnWidth(const std::vector& names, const unsigne void CheckReaderTypeMatches(const std::type_info &colType, const std::type_info &requestedType, const std::string &colName) { - // We want to explicitly support the reading of bools as unsigned char, as - // this is quite common to circumvent the std::vector specialization. - const bool explicitlySupported = (colType == typeid(bool) && requestedType == typeid(unsigned char)) ? true : false; + // We explicitly support certain type conversions + const bool explicitlySupported = [&colType, &requestedType]() { + // bool as unsigned char is common to circumvent the std::vector specialization. + if (colType == typeid(bool) && requestedType == typeid(unsigned char)) + return true; + // char as unsigned char allows reading a vector of char as a Python numpy array of integers, avoiding the + // automatic conversion of 'char *' to string in Python. For more info, see + // https://github.com/root-project/root/issues/22554 + if (colType == typeid(char) && requestedType == typeid(unsigned char)) + return true; + + return false; + }(); // Here we compare names and not typeinfos since they may come from two different contexts: a compiled // and a jitted one.