diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py index a19a270c3f0db..ee50e05e7529f 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py @@ -229,6 +229,7 @@ def pypowarray(numpyvec, pow): from __future__ import annotations +import warnings from typing import Iterable, Optional from . import pythonization @@ -296,6 +297,15 @@ def RDataFrameAsNumpy( result_ptrs = {} for column in columns: column_type = df.GetColumnType(column) + if column_type == "char": + column_type = "unsigned char" + warnings.warn( + f"RDataFrame.AsNumpy: column '{column}' has type 'char', which would be automatically converted to a " + "Python string. Interpreting as 'unsigned char' instead, which results in a numpy array of dtype uint8. " + "If you use this column for numeric values, consider migrating this column to a type other than 'char', " + "which is usually used for text. For example, migrate to 'signed char' or 'unsigned char' or, " + "preferably, 'std::int8_t' or 'std::uint8_t'." 
+ ) # If the column type is a class, make sure cling knows about it tclass = ROOT.TClass.GetClass(column_type) diff --git a/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py b/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py index 0bc84e84b5b41..2ecdd9dafa262 100644 --- a/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py +++ b/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py @@ -7,7 +7,7 @@ import numpy as np import ROOT from ROOT._pythonization._rdataframe import _clone_asnumpyresult -import os + def make_tree(*dtypes): """ @@ -400,6 +400,26 @@ def test_rdataframe_as_numpy_array_jagged(self): self.assertTrue(all(isinstance(x, np.ndarray) for x in array)) self.assertTrue(all(len(x) == i for i, x in enumerate(array))) + def test_rdataframe_as_numpy_char_col_as_uint8(self): + ROOT.gInterpreter.Declare( + r""" + #ifndef ROOT_TEST_RDataFrameAsNumpy_GH_22554 + #define ROOT_TEST_RDataFrameAsNumpy_GH_22554 + char make_char(ULong64_t i) { + return static_cast<char>(65 + (i % 26)); // A, B, C, ... 
+ } + #endif + """ + ) + + rdf = ROOT.RDataFrame(10).Define("mycol", "make_char(rdfentry_)") + with self.assertWarnsRegex( + UserWarning, "column 'mycol' has type 'char', which would be automatically converted" + ): + npy = rdf.AsNumpy(["mycol"])["mycol"] + self.assertEqual(npy.dtype, np.uint8) + self.assertTrue(all(npy == np.array([65 + (i % 26) for i in range(10)], dtype=np.uint8))) + if __name__ == "__main__": unittest.main() diff --git a/tree/dataframe/src/RDFUtils.cxx b/tree/dataframe/src/RDFUtils.cxx index 5d810b13df80e..5374fa5cb45a3 100644 --- a/tree/dataframe/src/RDFUtils.cxx +++ b/tree/dataframe/src/RDFUtils.cxx @@ -513,9 +513,19 @@ unsigned int GetColumnWidth(const std::vector<std::string>& names, const unsigne void CheckReaderTypeMatches(const std::type_info &colType, const std::type_info &requestedType, const std::string &colName) { - // We want to explicitly support the reading of bools as unsigned char, as - // this is quite common to circumvent the std::vector<bool> specialization. - const bool explicitlySupported = (colType == typeid(bool) && requestedType == typeid(unsigned char)) ? true : false; + // We explicitly support certain type conversions + const bool explicitlySupported = [&colType, &requestedType]() { + // bool as unsigned char is common to circumvent the std::vector<bool> specialization. + if (colType == typeid(bool) && requestedType == typeid(unsigned char)) + return true; + // char as unsigned char allows reading a vector of char as a Python numpy array of integers, avoiding the + // automatic conversion of 'char *' to string in Python. For more info, see + // https://github.com/root-project/root/issues/22554 + if (colType == typeid(char) && requestedType == typeid(unsigned char)) + return true; + + return false; + }(); // Here we compare names and not typeinfos since they may come from two different contexts: a compiled // and a jitted one.