Skip to content

Commit f369ced

Browse files
committed
[df] Allow reading a char column into a numpy array
In the AsNumpy operation, values of the dataset are read into a ROOT::RVec collection of the corresponding column type. Subsequently, the raw data is accessed from the RVec and used to generate the array interface for a numpy array view on the collected data. When the column is of type char, and thus RDF would read values into a ROOT::RVec<char>, the raw data is accessed as a 'char *'. The Python bindings automatically convert 'char *' and 'const char *' to Python strings for full compatibility with existing functions (e.g. otherwise TObject::GetName would not return a string in Python). Thus, the array interface cannot be generated. This commit proposes to introduce a special behaviour in AsNumpy to automatically view the char column as an 'unsigned char' column. This in turn avoids the automatic conversion on the Python side. An array of 'unsigned char' is interpreted as a numpy array with dtype uint8. Since this decision might be unexpected to some users, the commit also proposes to let the user know about it via a warning.
1 parent 283b12f commit f369ced

3 files changed

Lines changed: 44 additions & 4 deletions

File tree

bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ def pypowarray(numpyvec, pow):
229229

230230
from __future__ import annotations
231231

232+
import warnings
232233
from typing import Iterable, Optional
233234

234235
from . import pythonization
@@ -296,6 +297,15 @@ def RDataFrameAsNumpy(
296297
result_ptrs = {}
297298
for column in columns:
298299
column_type = df.GetColumnType(column)
300+
if column_type == "char":
301+
column_type = "unsigned char"
302+
warnings.warn(
303+
f"RDataFrame.AsNumpy: column '{column}' has type 'char', which would be automatically converted to a "
304+
"Python string. Interpreting as 'unsigned char' instead, which results in a numpy array of dtype uint8. "
305+
"If you use this column for numeric values, consider migrating this column to a type other than 'char', "
306+
"which is usually used for text. For example, migrate to 'signed char' or 'unsigned char' or, "
307+
"preferrably, 'std::int8_t' or 'std::uint8_t'."
308+
)
299309

300310
# If the column type is a class, make sure cling knows about it
301311
tclass = ROOT.TClass.GetClass(column_type)

bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import numpy as np
88
import ROOT
99
from ROOT._pythonization._rdataframe import _clone_asnumpyresult
10-
import os
10+
1111

1212
def make_tree(*dtypes):
1313
"""
@@ -400,6 +400,26 @@ def test_rdataframe_as_numpy_array_jagged(self):
400400
self.assertTrue(all(isinstance(x, np.ndarray) for x in array))
401401
self.assertTrue(all(len(x) == i for i, x in enumerate(array)))
402402

403+
def test_rdataframe_as_numpy_char_col_as_uint8(self):
    """
    Reading a column of type 'char' through AsNumpy must emit a UserWarning
    and produce a numpy array of dtype uint8 holding the character codes.
    """
    # Declare the helper only once per interpreter session: the include
    # guard makes repeated invocations of this test a no-op for cling.
    ROOT.gInterpreter.Declare(
        r"""
        #ifndef ROOT_TEST_RDataFrameAsNumpy_GH_22554
        #define ROOT_TEST_RDataFrameAsNumpy_GH_22554
        char make_char(ULong64_t i) {
            return static_cast<char>(65 + (i % 26)); // A, B, C, ...
        }
        #endif
        """
    )

    rdf = ROOT.RDataFrame(10).Define("mycol", "make_char(rdfentry_)")
    # assertWarnsRegex actually matches the warning text; the 'msg' keyword of
    # assertWarns is only the failure message and would not check the content.
    with self.assertWarnsRegex(
        UserWarning, "column 'mycol' has type 'char', which would be automatically converted"
    ):
        npy = rdf.AsNumpy(["mycol"])["mycol"]

    self.assertEqual(npy.dtype, np.uint8)
    expected = np.array([65 + (i % 26) for i in range(10)], dtype=np.uint8)
    # array_equal also verifies that the shapes agree, not just the values.
    self.assertTrue(np.array_equal(npy, expected))
422+
403423

404424
if __name__ == "__main__":
405425
unittest.main()

tree/dataframe/src/RDFUtils.cxx

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -513,9 +513,19 @@ unsigned int GetColumnWidth(const std::vector<std::string>& names, const unsigne
513513
void CheckReaderTypeMatches(const std::type_info &colType, const std::type_info &requestedType,
514514
const std::string &colName)
515515
{
516-
// We want to explicitly support the reading of bools as unsigned char, as
517-
// this is quite common to circumvent the std::vector<bool> specialization.
518-
const bool explicitlySupported = (colType == typeid(bool) && requestedType == typeid(unsigned char)) ? true : false;
516+
// We explicitly support certain type conversions
517+
const bool explicitlySupported = [&colType, &requestedType]() {
518+
// bool as unsigned char is common to circumvent the std::vector<bool> specialization.
519+
if (colType == typeid(bool) && requestedType == typeid(unsigned char))
520+
return true;
521+
// char as unsigned char allows reading a vector of char as a Python numpy array of integers, avoiding the
522+
// automatic conversion of 'char *' to string in Python. For more info, see
523+
// https://github.com/root-project/root/issues/22554
524+
if (colType == typeid(char) && requestedType == typeid(unsigned char))
525+
return true;
526+
527+
return false;
528+
}();
519529

520530
// Here we compare names and not typeinfos since they may come from two different contexts: a compiled
521531
// and a jitted one.

0 commit comments

Comments
 (0)