Skip to content

Commit 406d354

Browse files
author
Jonah Ascoli
committed
[tutorials][ML] Add Higgs classification dataloader tutorial
Expand on the df106 tutorial by building a network to classify Higgs data.
1 parent e8df1eb commit 406d354

6 files changed

Lines changed: 254 additions & 0 deletions

tutorials/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ if(MSVC AND NOT win_broken_tests)
7676
list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py)
7777
list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py)
7878
list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py)
79+
list(APPEND dataframe_veto machine_learning/ml_dataloader_Higgs_Classification.py)
7980
# df036* and df037* seem to trigger OS errors when trying to delete the
8081
# test files created in the tutorials. It is unclear why.
8182
list(APPEND dataframe_veto analysis/dataframe/df036_missingBranches.C)
@@ -128,6 +129,7 @@ if (NOT dataframe)
128129
list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py)
129130
list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py)
130131
list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py)
132+
list(APPEND dataframe_veto machine_learning/ml_dataloader_Higgs_Classification.py)
131133
# RooFit tutorials depending on RDataFrame
132134
list(APPEND dataframe_veto
133135
roofit/roofit/rf408_RDataFrameToRooFit.C
@@ -937,6 +939,7 @@ if(pyroot)
937939
file(GLOB requires_torch RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
938940
machine_learning/pytorch/*.py
939941
machine_learning/ml_dataloader_PyTorch.py
942+
machine_learning/ml_dataloader_Higgs_Classification.py
940943
)
941944
file(GLOB requires_xgboost RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
942945
machine_learning/tmva101_Training.py
Binary file not shown.
Binary file not shown.
Binary file not shown.

tutorials/machine_learning/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,4 +137,5 @@
137137
| ml_dataloader_NumPy.py | Loading batches of events from a ROOT dataset as Python generators of numpy arrays. |
138138
| ml_dataloader_PyTorch.py | Loading batches of events from a ROOT dataset into a basic PyTorch workflow. |
139139
| ml_dataloader_TensorFlow.py | Loading batches of events from a ROOT dataset into a basic TensorFlow workflow. |
140+
| ml_dataloader_Higgs_Classification | Loading batches of events from different files for a data-normalization workflow. |
140141

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
## \file
2+
## \ingroup tutorial_dataframe
3+
## The Higgs to four lepton analysis from the ATLAS Open Data release of 2020, with RDataFrame.
4+
##
5+
## This tutorial is a continuation of the HiggsToFourLeptons tutorial.
6+
## We will build a model to classify the data as Higgs or not Higgs.
7+
##
8+
##
9+
## \macro_code
10+
## \macro_output
11+
##
12+
## \date June 2026
13+
## \authors Jonah Ascoli (CERN), Martin Foll (CERN, University of Oslo (UiO)), Silia Taider (CERN)
14+
15+
import matplotlib.pyplot as plt
16+
import ROOT
17+
import sklearn.metrics as skl
18+
import torch
19+
from matplotlib import use
20+
from torch import nn
21+
22+
print("Loading dataframes...")
23+
data_dir = ROOT.gROOT.GetTutorialDir().Data() + "/machine_learning/data/"
24+
df_train = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_train.root")
25+
df_val = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_val.root")
26+
df_test = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_test.root")
27+
28+
29+
# Classifier model with adjustable hidden layers
30+
class Classifier(nn.Module):
31+
def __init__(
32+
self,
33+
num_features: int,
34+
hidden_layers: list[int],
35+
p: float = 0.2,
36+
use_dropout: bool = False,
37+
use_batchnorm: bool = True,
38+
):
39+
super().__init__()
40+
41+
layers = []
42+
in_dim = num_features
43+
44+
for out_dim in hidden_layers:
45+
block = [nn.Linear(in_dim, out_dim)]
46+
47+
if use_batchnorm:
48+
block.append(nn.BatchNorm1d(out_dim))
49+
50+
block.append(nn.ReLU())
51+
52+
if use_dropout:
53+
block.append(nn.Dropout(p))
54+
55+
layers.append(nn.Sequential(*block))
56+
in_dim = out_dim
57+
58+
self.hidden = nn.Sequential(*layers)
59+
self.output_layer = nn.Linear(in_dim, 1)
60+
61+
def forward(self, x):
62+
x = self.hidden(x)
63+
x = self.output_layer(x)
64+
return torch.sigmoid(x)
65+
66+
67+
batch_size = 1000
68+
batches_in_memory = 1000
69+
drop_remainder = True
70+
columns = ["m4l", "good_lep", "goodlep_E", "goodlep_eta", "goodlep_phi", "goodlep_pt", "goodlep_type", "isHiggsRef"]
71+
target = "isHiggsRef"
72+
max_vec_sizes = {"good_lep": 4, "goodlep_E": 4, "goodlep_eta": 4, "goodlep_phi": 4, "goodlep_pt": 4, "goodlep_type": 4}
73+
shuffle = True
74+
set_seed = 42
75+
76+
# Normalize the data!
77+
print("Normalizing data...")
78+
for var in columns[:-1]:
79+
if var == "m4l": # The only non-vector column
80+
mean = df_train.Mean(var).GetValue()
81+
stddev = df_train.StdDev(var).GetValue()
82+
df_train = df_train.Redefine(var, f"({var} - {mean}) / {stddev}")
83+
# The validation and testing data should be normalized based on the
84+
# mean and standard deviation calculated from the training data.
85+
df_val = df_val.Redefine(var, f"({var} - {mean}) / {stddev}")
86+
df_test = df_test.Redefine(var, f"({var} - {mean}) / {stddev}")
87+
else:
88+
# Each vector event has 4 columns, and we need to take a column-wise mean and stddev
89+
means = []
90+
stddevs = []
91+
for i in range(max_vec_sizes[var]):
92+
scalar_column = f"{var}_{i}"
93+
df_train = df_train.Define(scalar_column, f"{var}[{i}]")
94+
means.append(df_train.Mean(scalar_column).GetValue())
95+
stddevs.append(df_train.StdDev(scalar_column).GetValue())
96+
mean_vec = ROOT.RVec("double")(means)
97+
stddev_vec = ROOT.RVec("double")(stddevs)
98+
for i in range(len(stddevs)):
99+
if stddevs[i] == 0:
100+
stddevs[i] = 0.01 # Avoids division by 0
101+
expr = ", ".join(f"(({var}[{i}] - {means[i]}) / {stddevs[i]})" for i in range(max_vec_sizes[var]))
102+
df_train = df_train.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
103+
# The validation and testing data should be normalized based on the
104+
# mean and standard deviation calculated from the training data.
105+
df_val = df_val.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
106+
df_test = df_test.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
107+
108+
print("Creating dataloaders...")
109+
train = ROOT.Experimental.ML.RDataLoader(
110+
df_train,
111+
batch_size=batch_size,
112+
batches_in_memory=batches_in_memory,
113+
drop_remainder=drop_remainder,
114+
columns=columns,
115+
target=target,
116+
max_vec_sizes=max_vec_sizes,
117+
shuffle=shuffle,
118+
set_seed=set_seed,
119+
)
120+
val = ROOT.Experimental.ML.RDataLoader(
121+
df_val,
122+
batch_size=batch_size,
123+
batches_in_memory=batches_in_memory,
124+
drop_remainder=drop_remainder,
125+
columns=columns,
126+
target=target,
127+
max_vec_sizes=max_vec_sizes,
128+
shuffle=shuffle,
129+
set_seed=set_seed,
130+
)
131+
test = ROOT.Experimental.ML.RDataLoader(
132+
df_test,
133+
batch_size=batch_size,
134+
batches_in_memory=batches_in_memory,
135+
drop_remainder=drop_remainder,
136+
columns=columns,
137+
target=target,
138+
max_vec_sizes=max_vec_sizes,
139+
shuffle=shuffle,
140+
set_seed=set_seed,
141+
)
142+
143+
# num_features must be calculated manually since the train.training_columns includes condensed vector columns.
144+
# Vector columns are lazily expanded while receiving batches, unless eager_loading is enabled.
145+
num_features = sum(max_vec_sizes.values()) + len([0 for i in train.train_columns if i not in max_vec_sizes])
146+
147+
torch.manual_seed(set_seed)
148+
hidden_layers = [60, 60]
149+
model = Classifier(num_features=num_features, hidden_layers=hidden_layers, p=0.2, use_dropout=False)
150+
loss_fn = nn.BCELoss()
151+
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
152+
153+
154+
def print_epoch_summary(epoch: int, val_loss: float, val_accuracy: float):
155+
print(f"Epoch {epoch} summary ==> Validation loss: {val_loss:.2f}; Validation accuracy: {val_accuracy:.2f}")
156+
157+
158+
epochs = 1000
159+
last_val_losses = [float("inf")] * 6
160+
# Early stopping criterion: most recent 3 avg. losses are worse than the 3 before that
161+
avg_val_losses = []
162+
print("Starting training...")
163+
for epoch in range(epochs):
164+
# training
165+
model.train()
166+
167+
for i, (x_train, y_train) in enumerate(train.as_torch()):
168+
outputs = model(x_train)
169+
loss = loss_fn(outputs, y_train)
170+
171+
optimizer.zero_grad()
172+
loss.backward()
173+
optimizer.step()
174+
175+
# validation
176+
model.eval()
177+
val_loss = 0
178+
val_correct = 0
179+
val_total = 0
180+
181+
with torch.no_grad():
182+
for j, (x_val, y_val) in enumerate(val.as_torch()):
183+
outputs = model(x_val)
184+
loss = loss_fn(outputs, y_val)
185+
val_loss += loss.item()
186+
187+
preds = (outputs > 0.5).float()
188+
val_correct += (preds == y_val).sum().item()
189+
val_total += y_val.size(0)
190+
191+
avg_val_loss = val_loss / (j + 1)
192+
avg_val_losses.append(avg_val_loss)
193+
val_accuracy = val_correct / val_total
194+
195+
if epoch % 10 == 9:
196+
print_epoch_summary(epoch + 1, val_loss, val_accuracy)
197+
del last_val_losses[0]
198+
last_val_losses.append(avg_val_loss)
199+
# Early stopping check
200+
if min(last_val_losses[-3:]) > max(last_val_losses[:3]):
201+
print(f"Validation loss has not improved for 6 epochs, stopping training after {epoch + 1} epochs.")
202+
epochs = epoch + 1
203+
break
204+
205+
# Testing
206+
model.eval()
207+
test_loss = 0
208+
test_correct = 0
209+
test_total = 0
210+
211+
test_preds = []
212+
test_true = []
213+
with torch.no_grad():
214+
for j, (x_test, y_test) in enumerate(test.as_torch()):
215+
outputs = model(x_test)
216+
loss = loss_fn(outputs, y_test)
217+
test_loss += loss.item()
218+
test_preds += outputs.tolist()
219+
test_true += y_test.tolist()
220+
221+
preds = (outputs > 0.5).float()
222+
test_correct += (preds == y_test).sum().item()
223+
test_total += y_test.size(0)
224+
225+
avg_test_loss = test_loss / (j + 1)
226+
test_accuracy = test_correct / test_total
227+
228+
print(f"Testing Loss: {avg_test_loss:.4f} Accuracy: {test_accuracy:.4f}\n")
229+
230+
# Analysis
231+
use("Agg") # Non-interactive backend for writing to files
232+
233+
fig = plt.figure()
234+
ax = plt.axes()
235+
ax.plot([i for i in range(epochs)], avg_val_losses)
236+
plt.title("Loss curve")
237+
plt.xlabel("Epoch")
238+
plt.ylabel("Validation loss")
239+
plt.savefig("loss_curve")
240+
print("Loss curve saved to loss_curve.png")
241+
242+
fpr, tpr, thresholds = skl.roc_curve(test_true, test_preds)
243+
fig = plt.figure()
244+
ax = plt.axes()
245+
ax.plot(fpr[:-1], tpr[:-1])
246+
plt.title("ROC curve")
247+
plt.xlabel("False positive rate")
248+
plt.ylabel("True positive rate")
249+
plt.savefig("ROC_curve")
250+
print("ROC curve saved to ROC_curve.png")

0 commit comments

Comments
 (0)