Skip to content

Commit e786bd5

Browse files
author
Jonah Ascoli
committed
[tutorials][ML] Add Higgs classification dataloader tutorial
Expand on the df106 tutorial by building a network to classify Higgs data.
1 parent e8df1eb commit e786bd5

6 files changed

Lines changed: 253 additions & 0 deletions

tutorials/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ if(MSVC AND NOT win_broken_tests)
7676
list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py)
7777
list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py)
7878
list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py)
79+
list(APPEND dataframe_veto machine_learning/ml_dataloader_Higgs_Classification.py)
7980
# df036* and df037* seem to trigger OS errors when trying to delete the
8081
# test files created in the tutorials. It is unclear why.
8182
list(APPEND dataframe_veto analysis/dataframe/df036_missingBranches.C)
@@ -128,6 +129,7 @@ if (NOT dataframe)
128129
list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py)
129130
list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py)
130131
list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py)
132+
list(APPEND dataframe_veto machine_learning/ml_dataloader_Higgs_Classification.py)
131133
# RooFit tutorials depending on RDataFrame
132134
list(APPEND dataframe_veto
133135
roofit/roofit/rf408_RDataFrameToRooFit.C
@@ -937,6 +939,7 @@ if(pyroot)
937939
file(GLOB requires_torch RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
938940
machine_learning/pytorch/*.py
939941
machine_learning/ml_dataloader_PyTorch.py
942+
machine_learning/ml_dataloader_Higgs_Classification.py
940943
)
941944
file(GLOB requires_xgboost RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
942945
machine_learning/tmva101_Training.py
Binary file not shown.
Binary file not shown.
Binary file not shown.

tutorials/machine_learning/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,4 +137,5 @@
137137
| ml_dataloader_NumPy.py | Loading batches of events from a ROOT dataset as Python generators of numpy arrays. |
138138
| ml_dataloader_PyTorch.py | Loading batches of events from a ROOT dataset into a basic PyTorch workflow. |
139139
| ml_dataloader_TensorFlow.py | Loading batches of events from a ROOT dataset into a basic TensorFlow workflow. |
140+
| ml_dataloader_Higgs_Classification | Loading batches of events from different files for a data-normalization workflow. |
140141

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
## \file
2+
## \ingroup tutorial_dataframe
3+
## \notebook -draw
4+
## The Higgs to four lepton analysis from the ATLAS Open Data release of 2020, with RDataFrame.
5+
##
6+
## This tutorial is a continuation of the HiggsToFourLeptons tutorial.
7+
## We will build a model to classify the data as Higgs or not Higgs.
8+
##
9+
##
10+
## \macro_image
11+
## \macro_code
12+
## \macro_output
13+
##
14+
## \date June 2026
15+
## \authors Jonah Ascoli (CERN), Martin Foll (CERN), Silia Taider (CERN)
16+
17+
import matplotlib.pyplot as plt
18+
import ROOT
19+
import sklearn.metrics as skl
20+
import torch
21+
from torch import nn
22+
23+
print("Loading dataframes...")
24+
data_dir = ROOT.gROOT.GetTutorialDir().Data() + "/machine_learning/data/"
25+
df_train = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_train.root")
26+
df_val = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_val.root")
27+
df_test = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_test.root")
28+
29+
30+
# Classifier model with adjustable hidden layers
31+
class Classifier(nn.Module):
32+
def __init__(
33+
self,
34+
num_features: int,
35+
hidden_layers: list[int],
36+
p: float = 0.2,
37+
use_dropout: bool = False,
38+
use_batchnorm: bool = True,
39+
):
40+
super().__init__()
41+
42+
layers = []
43+
in_dim = num_features
44+
45+
for out_dim in hidden_layers:
46+
block = [nn.Linear(in_dim, out_dim)]
47+
48+
if use_batchnorm:
49+
block.append(nn.BatchNorm1d(out_dim))
50+
51+
block.append(nn.ReLU())
52+
53+
if use_dropout:
54+
block.append(nn.Dropout(p))
55+
56+
layers.append(nn.Sequential(*block))
57+
in_dim = out_dim
58+
59+
self.hidden = nn.Sequential(*layers)
60+
self.output_layer = nn.Linear(in_dim, 1)
61+
62+
def forward(self, x):
63+
x = self.hidden(x)
64+
x = self.output_layer(x)
65+
return torch.sigmoid(x)
66+
67+
68+
batch_size = 1000
69+
batches_in_memory = 1000
70+
drop_remainder = True
71+
columns = ["m4l", "good_lep", "goodlep_E", "goodlep_eta", "goodlep_phi", "goodlep_pt", "goodlep_type", "isHiggsRef"]
72+
target = "isHiggsRef"
73+
max_vec_sizes = {"good_lep": 4, "goodlep_E": 4, "goodlep_eta": 4, "goodlep_phi": 4, "goodlep_pt": 4, "goodlep_type": 4}
74+
shuffle = True
75+
set_seed = 42
76+
77+
# Normalize the data!
78+
print("Normalizing data...")
79+
for var in columns[:-1]:
80+
if var == "m4l": # The only non-vector column
81+
mean = df_train.Mean(var).GetValue()
82+
stddev = df_train.StdDev(var).GetValue()
83+
df_train = df_train.Redefine(var, f"({var} - {mean}) / {stddev}")
84+
# The validation and testing data should be normalized based on the
85+
# mean and standard deviation calculated from the training data.
86+
df_val = df_val.Redefine(var, f"({var} - {mean}) / {stddev}")
87+
df_test = df_test.Redefine(var, f"({var} - {mean}) / {stddev}")
88+
else:
89+
# Each vector event has 4 columns, and we need to take a column-wise mean and stddev
90+
means = []
91+
stddevs = []
92+
for i in range(max_vec_sizes[var]):
93+
scalar_column = f"{var}_{i}"
94+
df_train = df_train.Define(scalar_column, f"{var}[{i}]")
95+
means.append(df_train.Mean(scalar_column).GetValue())
96+
stddevs.append(df_train.StdDev(scalar_column).GetValue())
97+
mean_vec = ROOT.RVec("double")(means)
98+
stddev_vec = ROOT.RVec("double")(stddevs)
99+
for i in range(len(stddevs)):
100+
if stddevs[i] == 0:
101+
stddevs[i] = 0.01 # Avoids division by 0
102+
expr = ", ".join(f"(({var}[{i}] - {means[i]}) / {stddevs[i]})" for i in range(max_vec_sizes[var]))
103+
df_train = df_train.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
104+
# The validation and testing data should be normalized based on the
105+
# mean and standard deviation calculated from the training data.
106+
df_val = df_val.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
107+
df_test = df_test.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
108+
109+
print("Creating dataloaders...")
110+
train = ROOT.Experimental.ML.RDataLoader(
111+
df_train,
112+
batch_size=batch_size,
113+
batches_in_memory=batches_in_memory,
114+
drop_remainder=drop_remainder,
115+
columns=columns,
116+
target=target,
117+
max_vec_sizes=max_vec_sizes,
118+
shuffle=shuffle,
119+
set_seed=set_seed,
120+
)
121+
val = ROOT.Experimental.ML.RDataLoader(
122+
df_val,
123+
batch_size=batch_size,
124+
batches_in_memory=batches_in_memory,
125+
drop_remainder=drop_remainder,
126+
columns=columns,
127+
target=target,
128+
max_vec_sizes=max_vec_sizes,
129+
shuffle=shuffle,
130+
set_seed=set_seed,
131+
)
132+
test = ROOT.Experimental.ML.RDataLoader(
133+
df_test,
134+
batch_size=batch_size,
135+
batches_in_memory=batches_in_memory,
136+
drop_remainder=drop_remainder,
137+
columns=columns,
138+
target=target,
139+
max_vec_sizes=max_vec_sizes,
140+
shuffle=shuffle,
141+
set_seed=set_seed,
142+
)
143+
144+
145+
num_features = sum(max_vec_sizes.values()) + len([0 for i in train.train_columns if i not in max_vec_sizes])
146+
# Must be calculated manually since columns is not yet expanded
147+
148+
torch.manual_seed(set_seed)
149+
hidden_layers = [60, 60]
150+
model = Classifier(num_features=num_features, hidden_layers=hidden_layers, p=0.2, use_dropout=False)
151+
loss_fn = nn.BCELoss()
152+
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
153+
154+
155+
def print_epoch_summary(epoch: int, val_loss: float, val_accuracy: float):
156+
print(f"Epoch {epoch} summary ==> Validation loss: {val_loss:.2f}; Validation accuracy: {val_accuracy:.2f}")
157+
158+
159+
epochs = 1000
160+
last_val_losses = [float("inf")] * 6
161+
# Early stopping criterion: most recent 3 avg. losses are worse than the 3 before that
162+
avg_val_losses = []
163+
print("Starting training...")
164+
for epoch in range(epochs):
165+
# training
166+
model.train()
167+
168+
for i, (x_train, y_train) in enumerate(train.as_torch()):
169+
outputs = model(x_train)
170+
loss = loss_fn(outputs, y_train)
171+
172+
optimizer.zero_grad()
173+
loss.backward()
174+
optimizer.step()
175+
176+
# validation
177+
model.eval()
178+
val_loss = 0
179+
val_correct = 0
180+
val_total = 0
181+
182+
with torch.no_grad():
183+
for j, (x_val, y_val) in enumerate(val.as_torch()):
184+
outputs = model(x_val)
185+
loss = loss_fn(outputs, y_val)
186+
val_loss += loss.item()
187+
188+
preds = (outputs > 0.5).float()
189+
val_correct += (preds == y_val).sum().item()
190+
val_total += y_val.size(0)
191+
192+
avg_val_loss = val_loss / (j + 1)
193+
avg_val_losses.append(avg_val_loss)
194+
val_accuracy = val_correct / val_total
195+
196+
if epoch % 10 == 9:
197+
print_epoch_summary(epoch + 1, val_loss, val_accuracy)
198+
del last_val_losses[0]
199+
last_val_losses.append(avg_val_loss)
200+
# Early stopping check
201+
if min(last_val_losses[-3:]) > max(last_val_losses[:3]):
202+
print(f"Validation loss has not improved for 6 epochs, stopping training after {epoch + 1} epochs.")
203+
epochs = epoch + 1
204+
break
205+
206+
# Testing
207+
model.eval()
208+
test_loss = 0
209+
test_correct = 0
210+
test_total = 0
211+
212+
test_preds = []
213+
test_true = []
214+
with torch.no_grad():
215+
for j, (x_test, y_test) in enumerate(test.as_torch()):
216+
outputs = model(x_test)
217+
loss = loss_fn(outputs, y_test)
218+
test_loss += loss.item()
219+
test_preds += outputs
220+
test_true += y_test
221+
222+
preds = (outputs > 0.5).float()
223+
test_correct += (preds == y_test).sum().item()
224+
test_total += y_test.size(0)
225+
226+
avg_test_loss = test_loss / (j + 1)
227+
test_accuracy = test_correct / test_total
228+
229+
print(f"Testing Loss: {avg_test_loss:.4f} Accuracy: {test_accuracy:.4f}\n")
230+
231+
# Analysis
232+
fig = plt.figure()
233+
ax = plt.axes()
234+
ax.plot([i for i in range(epochs)], avg_val_losses)
235+
plt.title("Loss curve")
236+
plt.xlabel("Epoch")
237+
plt.ylabel("Validation loss")
238+
plt.savefig("loss_curve")
239+
print("Loss curve saved to loss_curve.png")
240+
241+
fpr, tpr, thresholds = skl.roc_curve(test_true, test_preds)
242+
fig = plt.figure()
243+
ax = plt.axes()
244+
ax.plot(fpr[:-1], tpr[:-1])
245+
plt.title("ROC curve")
246+
plt.xlabel("False positive rate")
247+
plt.ylabel("True positive rate")
248+
plt.savefig("ROC_curve")
249+
print("ROC curve saved to ROC_curve.png")

0 commit comments

Comments
 (0)