[doc][Python] Add doxygen docstrings to the ML section + refactoring

siliataider · siliataider · commit c4e18aa67ec7 · 2026-05-18T12:04:08.000+02:00
diff --git a/bindings/pyroot/pythonizations/doc/index.md b/bindings/pyroot/pythonizations/doc/index.md
@@ -100,7 +100,7 @@ h.Fill(data)
 
 # Write it to a ROOT file
 with ROOT.TFile.Open("output.root", "RECREATE") as f:
-    h.Write()
+    f.WriteObject(h, "my_histogram")
 ~~~
 
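-Now we create an RDataFrame from scratch, define a new column with a Python lambda and draw a histogram:
+Now we create an RDataFrame from scratch, define a new column with a C++ expression string and draw a histogram: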
@@ -111,8 +111,8 @@ import numpy as np
 # Create an RDataFrame with 10000 rows
 rdf = ROOT.RDataFrame(10000)
 
-# Define a column x
-rdf = rdf.Define("x", lambda : np.random.normal(0, 1))
+# Define a column x filled with values drawn from a standard normal distribution
+rdf = rdf.Define("x", "gRandom->Gaus(0, 1)")
 
 # Draw a histogram of x
 rdf.Histo1D("x").Draw()
diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py
@@ -519,6 +519,7 @@ def __exit__(self, type, value, traceback):
 # formatted iterator (returned by as_torch / as_numpy / as_tensorflow)
 class FormattedLoader:
     """
+    \ingroup Py_ML
     Iterable that converts each batch to the requested format.
     Returned by the as_torch / as_numpy / as_tensorflow methods on RDataLoader.
     """
@@ -550,6 +551,7 @@ def __iter__(self):
 
 class RDataLoader:
     """
+    \ingroup Py_ML
     Entry point for ML batch loading from a ROOT RDataFrame.
 
     Usage without a validation split::
@@ -587,6 +589,8 @@ def __init__(
         replacement: bool = False,
     ) -> None:
         """
+        \ingroup Py_ML
+
         Args:
             rdataframes:
                 RDataFrame or list of RDataFrames to load from.
@@ -699,13 +703,15 @@ def train_test_split(self, test_size: float = 0.2) -> Tuple[RDataLoader, RDataLo
 
     def as_numpy(self) -> FormattedLoader:
         """
+        \ingroup Py_ML
         Return an iterable that yields batches as NumPy arrays.
         """
         self._ensure_created()
         return FormattedLoader(self._internal, self._internal.ConvertBatchToNumpy, self._is_training)
 
     def as_torch(self, device: str | torch.device | None = None) -> FormattedLoader:
         """
+        \ingroup Py_ML
         Return an iterable that yields batches as PyTorch tensors.
 
         Args:
@@ -717,6 +723,7 @@ def as_torch(self, device: str | torch.device | None = None) -> FormattedLoader:
 
     def as_tensorflow(self) -> tf.data.Dataset:
         """
+        \ingroup Py_ML
         Return a tf.data.Dataset over batches as TensorFlow tensors.
         """
         import tensorflow as tf
@@ -751,14 +758,20 @@ def as_tensorflow(self) -> tf.data.Dataset:
 
     @property
     def columns(self) -> list[str]:
-        """All column names as they appear in each batch tensor."""
+        """
+        \ingroup Py_ML
+        All column names as they appear in each batch tensor.
+        """
         if self._internal is None:
             return self._params["columns"]
         return self._internal.all_columns
 
     @property
     def train_columns(self) -> list[str]:
-        """Feature column names (columns minus target and weights)."""
+        """
+        \ingroup Py_ML
+        Feature column names (columns minus target and weights).
+        """
         if self._internal is None:
             target = self._params["target"] if self._params["target"] is not None else []
             weights = self._params["weights"] if self._params["weights"] is not None else []
@@ -767,21 +780,29 @@ def train_columns(self) -> list[str]:
 
     @property
     def target_columns(self) -> list[str]:
-        """Target column names."""
+        """\ingroup Py_ML
+        Target column names.
+        """
         if self._internal is None:
             return self._params["target"] if self._params["target"] is not None else []
         return self._internal.target_columns
 
     @property
     def weights_column(self) -> str:
-        """Weights column name, or empty string if not set."""
+        """
+        \ingroup Py_ML
+        Weights column name, or empty string if not set.
+        """
         if self._internal is None:
             return self._params["weights"] if self._params["weights"] is not None else ""
         return self._internal.weights_column
 
     @property
     def num_batches(self) -> int:
-        """Total number of batches in this split for one epoch."""
+        """
+        \ingroup Py_ML
+        Total number of batches in this split for one epoch.
+        """
         if self._internal is None:
             raise RuntimeError(
                 "num_batches is available after the first call to "
@@ -793,7 +814,10 @@ def num_batches(self) -> int:
 
     @property
     def last_batch_no_of_rows(self) -> int:
-        """Number of rows in the last (remainder) batch, 0 if no remainder."""
+        """
+        \ingroup Py_ML
+        Number of rows in the last (remainder) batch, 0 if no remainder.
+        """
         if self._internal is None:
             raise RuntimeError(
                 "last_batch_no_of_rows is available after the first call to "
diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/dataloader.md b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/dataloader.md
@@ -38,13 +38,9 @@ import ROOT
 # Open a ROOT file and create an RDataFrame
 rdf = ROOT.RDataFrame("events", "file.root")
 
-# Define a Python callback to compute a new variable
-def invariant_mass(E: float, p: float) -> float:
-    return math.sqrt(E**2 - p**2)
-
 # Apply selections and compute derived features
 rdf = rdf.Filter("nMuons >= 2") \
-         .Define("inv_mass", invariant_mass, ["E", "p"])
+         .Define("inv_mass", "sqrt(E*E - p*p)")
 ~~~
 
 Then pass your `RDataFrame` to `RDataLoader`:
@@ -138,7 +134,7 @@ dl = RDataLoader(
 # events with fewer than 10 jets are zero-padded
 ~~~
 
-\warning Every RVec column in `columns` must appear in `max_vec_sizes`.
+\warning Every vector column in `columns` must appear in `max_vec_sizes`.
 
 ## Iterating Batches
 
@@ -212,6 +208,14 @@ train, val = train_val.train_test_split(test_size=0.176)
 
 ## Advanced Features
 
+### Eager loading
+
+By default the loader reads data lazily, one chunk at a time. For small datasets that fit in memory and will be iterated many times, eager loading pays a one-time cost at construction and then serves the batches of every epoch from memory:
+
+~~~{.py}
+dl = RDataLoader(rdf, batch_size=256, load_eager=True)
+~~~
+
 ### Resampling
 
 Correct class imbalance by oversampling the minority or undersampling the majority. You can do this by passing two RDataFrames:
@@ -244,33 +248,3 @@ dl = RDataLoader(rdf,
 for X, y, w in dl.as_torch():
     loss = (loss_fn(model(X), y) * w).mean()
 ~~~
-
-### Eager loading
-
-By default the loader reads data lazily, one chunk of data at a time. For small datasets that fit in memory and will be iterated many times, eager loading pays a one-time cost at construction and then serves every epoch from memory:
-
-~~~{.py}
-dl = RDataLoader(rdf, batch_size=256, load_eager=True)
-~~~
-
-## API Reference
-
-### RDataLoader(rdataframes, ...)
-
-| Argument | Type | Default | Description |
-|---|---|---|---|
-| `rdataframes` | `RDF \| list` | - | One or more RDataFrames to load from |
-| `batch_size` | `int` | `64` | Number of events per batch |
-| `batches_in_memory` | `int` | `10` | Shuffle buffer size in batches |
-| `columns` | `list[str]` | `None` | Branches to load - all if not given |
-| `max_vec_sizes` | `dict` | `None` | Max size per RVec column |
-| `vec_padding` | `float` | `0.0` | Pad value for short RVec entries |
-| `target` | `str \| list` | `None` | Label column(s) - returned as `y` |
-| `weights` | `str` | `""` | Event weight column - returned as `w` |
-| `shuffle` | `bool` | `True` | Randomise event order |
-| `drop_remainder` | `bool` | `True` | Drop last incomplete batch |
-| `set_seed` | `int` | `0` | RNG seed - 0 means random |
-| `load_eager` | `bool` | `False` | Load full dataset into RAM |
-| `sampling_type` | `str` | `""` | `"oversampling"` or `"undersampling"` |
-| `sampling_ratio` | `float` | `1.0` | Minority/majority ratio after resampling |
-| `replacement` | `bool` | `False` | Undersampling with replacement |