Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ def pypowarray(numpyvec, pow):

from __future__ import annotations

import warnings
from typing import Iterable, Optional

from . import pythonization
Expand Down Expand Up @@ -296,6 +297,15 @@ def RDataFrameAsNumpy(
result_ptrs = {}
for column in columns:
column_type = df.GetColumnType(column)
if column_type == "char":
column_type = "unsigned char"
warnings.warn(
f"RDataFrame.AsNumpy: column '{column}' has type 'char', which would be automatically converted to a "
"Python string. Interpreting as 'unsigned char' instead, which results in a numpy array of dtype uint8. "
"If you use this column for numeric values, consider migrating this column to a type other than 'char', "
"which is usually used for text. For example, migrate to 'signed char' or 'unsigned char' or, "
"preferrably, 'std::int8_t' or 'std::uint8_t'."
)

Comment thread
vepadulano marked this conversation as resolved.
# If the column type is a class, make sure cling knows about it
tclass = ROOT.TClass.GetClass(column_type)
Expand Down
22 changes: 21 additions & 1 deletion bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import numpy as np
import ROOT
from ROOT._pythonization._rdataframe import _clone_asnumpyresult
import os


def make_tree(*dtypes):
"""
Expand Down Expand Up @@ -400,6 +400,26 @@ def test_rdataframe_as_numpy_array_jagged(self):
self.assertTrue(all(isinstance(x, np.ndarray) for x in array))
self.assertTrue(all(len(x) == i for i, x in enumerate(array)))

def test_rdataframe_as_numpy_char_col_as_uint8(self):
    """
    Check that AsNumpy reads a column of type 'char' as a numpy array of
    dtype uint8, and that it emits a UserWarning explaining the
    reinterpretation.
    """
    # The preprocessor guard keeps the helper from being redefined if this
    # declaration is processed more than once in the same interpreter session.
    ROOT.gInterpreter.Declare(
        r"""
#ifndef ROOT_TEST_RDataFrameAsNumpy_GH_22554
#define ROOT_TEST_RDataFrameAsNumpy_GH_22554
char make_char(ULong64_t i) {
return static_cast<char>(65 + (i % 26)); // A, B, C, ...
}
#endif
"""
    )

    rdf = ROOT.RDataFrame(10).Define("mycol", "make_char(rdfentry_)")

    # assertWarns(..., msg=...) would only set the failure message and never
    # inspect the warning text. assertWarnsRegex actually searches the emitted
    # warning for the pattern. The pattern contains no regex metacharacters,
    # so it matches the literal text.
    expected_warning = "column 'mycol' has type 'char', which would be automatically converted"
    with self.assertWarnsRegex(UserWarning, expected_warning):
        npy = rdf.AsNumpy(["mycol"])["mycol"]

    expected = np.array([65 + (i % 26) for i in range(10)], dtype=np.uint8)
    self.assertEqual(npy.dtype, np.uint8)
    # array_equal also fails on a shape mismatch, unlike an element-wise
    # comparison that could broadcast.
    self.assertTrue(np.array_equal(npy, expected))


# Run the unittest test runner when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
16 changes: 13 additions & 3 deletions tree/dataframe/src/RDFUtils.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -513,9 +513,19 @@ unsigned int GetColumnWidth(const std::vector<std::string>& names, const unsigne
void CheckReaderTypeMatches(const std::type_info &colType, const std::type_info &requestedType,
const std::string &colName)
{
// We want to explicitly support the reading of bools as unsigned char, as
// this is quite common to circumvent the std::vector<bool> specialization.
const bool explicitlySupported = (colType == typeid(bool) && requestedType == typeid(unsigned char)) ? true : false;
// We explicitly support certain type conversions
const bool explicitlySupported = [&colType, &requestedType]() {
// bool as unsigned char is common to circumvent the std::vector<bool> specialization.
if (colType == typeid(bool) && requestedType == typeid(unsigned char))
return true;
// char as unsigned char allows reading a vector of char as a Python numpy array of integers, avoiding the
// automatic conversion of 'char *' to string in Python. For more info, see
// https://github.com/root-project/root/issues/22554
if (colType == typeid(char) && requestedType == typeid(unsigned char))
return true;

return false;
}();

// Here we compare names and not typeinfos since they may come from two different contexts: a compiled
// and a jitted one.
Expand Down
Loading