[doc][Python] Add doxygen docstrings to the ML section + refactoring

siliataider · siliataider · commit e608034cf922 · 2026-05-19T14:45:58.000+02:00
diff --git a/bindings/pyroot/pythonizations/doc/index.md b/bindings/pyroot/pythonizations/doc/index.md
@@ -6,7 +6,7 @@
 ROOT is a C++ framework used across HEP for data storage, analysis and visualisation. Its full API is available directly in Python through dynamic bindings powered by [cppyy](https://cppyy.readthedocs.io/). Every ROOT class you see in the
 C++ documentation is accessible from Python under the `ROOT` module.
 
-On top of that, a set of [pythonizations](@ref Pythonizations) adapt selected classes to feel more natively Pythonic: operator overloading, iterators, NumPy interoperability, and more.
+On top of that, a set of @ref Pythonizations adapt selected classes to feel more natively Pythonic: operator overloading, iterators, NumPy interoperability, and more.
 
 
 # Installation
@@ -100,19 +100,19 @@ h.Fill(data)
 
 # Write it to a ROOT file
 with ROOT.TFile.Open("output.root", "RECREATE") as f:
-    h.Write()
+    f.WriteObject(h, "my_histogram")
 ~~~
 
-Now we create an RDataFrame from scratch, define a new column with a Python lambda and draw a histogram:
+Now we create an @ref dataframe - ROOT's high-level interface for columnar data analysis - from scratch, define a new column and draw a histogram:
 
 ~~~{.py}
 import numpy as np
 
 # Create an RDataFrame with 10000 rows
 rdf = ROOT.RDataFrame(10000)
 
-# Define a column x
-rdf = rdf.Define("x", lambda : np.random.normal(0, 1))
+# Define a column x with values drawn from a standard normal distribution
+rdf = rdf.Define("x", "gRandom->Gaus(0, 1)")
 
 # Draw a histogram of x
 rdf.Histo1D("x").Draw()
diff --git a/bindings/pyroot/pythonizations/doc/interop.md b/bindings/pyroot/pythonizations/doc/interop.md
diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py
@@ -519,6 +519,7 @@ def __exit__(self, type, value, traceback):
 # formatted iterator (returned by as_torch / as_numpy / as_tensorflow)
 class FormattedLoader:
     """
+    @ingroup Py_ML
     Iterable that converts each batch to the requested format.
     Returned by the as_torch / as_numpy / as_tensorflow methods on RDataLoader.
     """
@@ -550,6 +551,7 @@ def __iter__(self):
 
 class RDataLoader:
     """
+    @ingroup Py_ML
     Entry point for ML batch loading from a ROOT RDataFrame.
 
     Usage without a validation split::
@@ -587,6 +589,8 @@ def __init__(
         replacement: bool = False,
     ) -> None:
         """
+        @ingroup Py_ML
+
         Args:
             rdataframes:
                 RDataFrame or list of RDataFrames to load from.
@@ -699,13 +703,15 @@ def train_test_split(self, test_size: float = 0.2) -> Tuple[RDataLoader, RDataLo
 
     def as_numpy(self) -> FormattedLoader:
         """
+        @ingroup Py_ML
         Return an iterable that yields batches as NumPy arrays.
         """
         self._ensure_created()
         return FormattedLoader(self._internal, self._internal.ConvertBatchToNumpy, self._is_training)
 
     def as_torch(self, device: str | torch.device | None = None) -> FormattedLoader:
         """
+        @ingroup Py_ML
         Return an iterable that yields batches as PyTorch tensors.
 
         Args:
@@ -717,6 +723,7 @@ def as_torch(self, device: str | torch.device | None = None) -> FormattedLoader:
 
     def as_tensorflow(self) -> tf.data.Dataset:
         """
+        @ingroup Py_ML
         Return a tf.data.Dataset over batches as TensorFlow tensors.
         """
         import tensorflow as tf
@@ -751,14 +758,20 @@ def as_tensorflow(self) -> tf.data.Dataset:
 
     @property
     def columns(self) -> list[str]:
-        """All column names as they appear in each batch tensor."""
+        """
+        @ingroup Py_ML
+        All column names as they appear in each batch tensor.
+        """
         if self._internal is None:
             return self._params["columns"]
         return self._internal.all_columns
 
     @property
     def train_columns(self) -> list[str]:
-        """Feature column names (columns minus target and weights)."""
+        """
+        @ingroup Py_ML
+        Feature column names (columns minus target and weights).
+        """
         if self._internal is None:
             target = self._params["target"] if self._params["target"] is not None else []
             weights = self._params["weights"] if self._params["weights"] is not None else []
@@ -767,21 +780,30 @@ def train_columns(self) -> list[str]:
 
     @property
     def target_columns(self) -> list[str]:
-        """Target column names."""
+        """
+        @ingroup Py_ML
+        Target column names.
+        """
         if self._internal is None:
             return self._params["target"] if self._params["target"] is not None else []
         return self._internal.target_columns
 
     @property
     def weights_column(self) -> str:
-        """Weights column name, or empty string if not set."""
+        """
+        @ingroup Py_ML
+        Weights column name, or empty string if not set.
+        """
         if self._internal is None:
             return self._params["weights"] if self._params["weights"] is not None else ""
         return self._internal.weights_column
 
     @property
     def num_batches(self) -> int:
-        """Total number of batches in this split for one epoch."""
+        """
+        @ingroup Py_ML
+        Total number of batches in this split for one epoch.
+        """
         if self._internal is None:
             raise RuntimeError(
                 "num_batches is available after the first call to "
@@ -793,7 +815,10 @@ def num_batches(self) -> int:
 
     @property
     def last_batch_no_of_rows(self) -> int:
-        """Number of rows in the last (remainder) batch, 0 if no remainder."""
+        """
+        @ingroup Py_ML
+        Number of rows in the last (remainder) batch, 0 if no remainder.
+        """
         if self._internal is None:
             raise RuntimeError(
                 "last_batch_no_of_rows is available after the first call to "
diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/dataloader.md b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/dataloader.md
@@ -3,7 +3,7 @@
 \brief Feed ROOT data directly into models for machine learning training.
 
 
-`RDataLoader` streams ROOT data into machine learning frameworks as batches ready for training. It takes any [RDataFrame](@ref Py_RDataFrame) as input, giving you access to the full ROOT ecosystem for filtering, defining new variables and applying selections; it delivers batches of your dataset for [NumPy](https://numpy.org/devdocs/reference/generated/numpy.ndarray.html), [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) and [PyTorch](https://docs.pytorch.org/docs/main/tensors.html) through a simple iteration interface.
+`RDataLoader` streams ROOT data into machine learning frameworks as batches ready for training. It takes any @ref dataframe as input, giving you access to the full ROOT ecosystem for filtering, defining new variables and applying selections; it delivers batches of your dataset for [NumPy](https://numpy.org/devdocs/reference/generated/numpy.ndarray.html), [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) and [PyTorch](https://docs.pytorch.org/docs/main/tensors.html) through a simple iteration interface.
 
 \note `RDataLoader` is part of `ROOT.Experimental.ML` and is currently experimental. The API may change between ROOT releases.
 
@@ -28,7 +28,7 @@ A one-page quick reference covering the API.
 
 ## Getting your data ready
 
-`RDataLoader` takes an `RDataFrame` as input. This means your data preparation (selecting events, computing
+`RDataLoader` takes an @ref dataframe as input. This means your data preparation (selecting events, computing
 new variables, applying cuts, etc.) all happens before the loader is created, using the full power of `RDataFrame`:
 
 ~~~{.py}
@@ -38,13 +38,9 @@ import ROOT
 # Open a ROOT file and create an RDataFrame
 rdf = ROOT.RDataFrame("events", "file.root")
 
-# Define a Python callback to compute a new variable
-def invariant_mass(E: float, p: float) -> float:
-    return math.sqrt(E**2 - p**2)
-
 # Apply selections and compute derived features
 rdf = rdf.Filter("nMuons >= 2") \
-         .Define("inv_mass", invariant_mass, ["E", "p"])
+         .Define("inv_mass", "sqrt(E*E - p*p)")
 ~~~
 
 Then pass your `RDataFrame` to `RDataLoader`:
@@ -138,7 +134,7 @@ dl = RDataLoader(
 # events with fewer than 10 jets are zero-padded
 ~~~
 
-\warning Every RVec column in `columns` must appear in `max_vec_sizes`.
+\warning Every vector column in `columns` must appear in `max_vec_sizes`.
 
 ## Iterating Batches
 
@@ -212,6 +208,14 @@ train, val = train_val.train_test_split(test_size=0.176)
 
 ## Advanced Features
 
+### Eager loading
+
+By default the loader reads data lazily, one chunk at a time. For small datasets that fit in memory and will be iterated many times, eager loading pays a one-time cost at construction and then serves the batches of every epoch from memory:
+
+~~~{.py}
+dl = RDataLoader(rdf, batch_size=256, load_eager=True)
+~~~
+
 ### Resampling
 
 Correct class imbalance by oversampling the minority or undersampling the majority. You can do this by passing two RDataFrames:
@@ -244,33 +248,3 @@ dl = RDataLoader(rdf,
 for X, y, w in dl.as_torch():
     loss = (loss_fn(model(X), y) * w).mean()
 ~~~
-
-### Eager loading
-
-By default the loader reads data lazily, one chunk of data at a time. For small datasets that fit in memory and will be iterated many times, eager loading pays a one-time cost at construction and then serves every epoch from memory:
-
-~~~{.py}
-dl = RDataLoader(rdf, batch_size=256, load_eager=True)
-~~~
-
-## API Reference
-
-### RDataLoader(rdataframes, ...)
-
-| Argument | Type | Default | Description |
-|---|---|---|---|
-| `rdataframes` | `RDF \| list` | - | One or more RDataFrames to load from |
-| `batch_size` | `int` | `64` | Number of events per batch |
-| `batches_in_memory` | `int` | `10` | Shuffle buffer size in batches |
-| `columns` | `list[str]` | `None` | Branches to load - all if not given |
-| `max_vec_sizes` | `dict` | `None` | Max size per RVec column |
-| `vec_padding` | `float` | `0.0` | Pad value for short RVec entries |
-| `target` | `str \| list` | `None` | Label column(s) - returned as `y` |
-| `weights` | `str` | `""` | Event weight column - returned as `w` |
-| `shuffle` | `bool` | `True` | Randomise event order |
-| `drop_remainder` | `bool` | `True` | Drop last incomplete batch |
-| `set_seed` | `int` | `0` | RNG seed - 0 means random |
-| `load_eager` | `bool` | `False` | Load full dataset into RAM |
-| `sampling_type` | `str` | `""` | `"oversampling"` or `"undersampling"` |
-| `sampling_ratio` | `float` | `1.0` | Minority/majority ratio after resampling |
-| `replacement` | `bool` | `False` | Undersampling with replacement |