Skip to content

Commit bec58f0

Browse files
author
Jonah Ascoli
committed
[tutorials][ML] Add Higgs classification dataloader tutorial
Expand on the df106 tutorial by building a network to classify Higgs data.
1 parent e8df1eb commit bec58f0

6 files changed

Lines changed: 242 additions & 0 deletions

tutorials/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ if(MSVC AND NOT win_broken_tests)
7676
list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py)
7777
list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py)
7878
list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py)
79+
list(APPEND dataframe_veto machine_learning/ml_dataloader_Higgs_Classification.py)
7980
# df036* and df037* seem to trigger OS errors when trying to delete the
8081
# test files created in the tutorials. It is unclear why.
8182
list(APPEND dataframe_veto analysis/dataframe/df036_missingBranches.C)
@@ -128,6 +129,7 @@ if (NOT dataframe)
128129
list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py)
129130
list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py)
130131
list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py)
132+
list(APPEND dataframe_veto machine_learning/ml_dataloader_Higgs_Classification.py)
131133
# RooFit tutorials depending on RDataFrame
132134
list(APPEND dataframe_veto
133135
roofit/roofit/rf408_RDataFrameToRooFit.C
@@ -937,6 +939,7 @@ if(pyroot)
937939
file(GLOB requires_torch RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
938940
machine_learning/pytorch/*.py
939941
machine_learning/ml_dataloader_PyTorch.py
942+
machine_learning/ml_dataloader_Higgs_Classification.py
940943
)
941944
file(GLOB requires_xgboost RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
942945
machine_learning/tmva101_Training.py
Binary file not shown.
Binary file not shown.
Binary file not shown.

tutorials/machine_learning/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,4 +137,5 @@
137137
| ml_dataloader_NumPy.py | Loading batches of events from a ROOT dataset as Python generators of numpy arrays. |
138138
| ml_dataloader_PyTorch.py | Loading batches of events from a ROOT dataset into a basic PyTorch workflow. |
139139
| ml_dataloader_TensorFlow.py | Loading batches of events from a ROOT dataset into a basic TensorFlow workflow. |
140+
| ml_dataloader_Higgs_Classification | Loading batches of events from different files for a data-normalization workflow. |
140141

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
## \file
2+
## \ingroup tutorial_dataframe
3+
## \notebook -draw
4+
## The Higgs to four lepton analysis from the ATLAS Open Data release of 2020, with RDataFrame.
5+
##
6+
## This tutorial is a continuation of the HiggsToFourLeptons tutorial.
7+
## We will build a model to classify the data as Higgs or not Higgs.
8+
##
9+
##
10+
## \macro_image
11+
## \macro_code
12+
## \macro_output
13+
##
14+
## \date June 2026
15+
## \authors Jonah Ascoli (CERN), Martin Foll (CERN), Silia Taider (CERN)
16+
17+
import matplotlib.pyplot as plt
18+
import ROOT
19+
import sklearn.metrics as skl
20+
import torch
21+
from torch import nn
22+
from tqdm import tqdm
23+
24+
data_dir = ROOT.gROOT.GetTutorialDir().Data() + "/machine_learning/data/"
25+
df_train = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_train.root")
26+
df_val = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_val.root")
27+
df_test = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_test.root")
28+
29+
30+
# Classifier model with adjustable hidden layers
31+
class Classifier(nn.Module):
32+
def __init__(
33+
self,
34+
num_features: int,
35+
hidden_layers: list[int],
36+
p: float = 0.2,
37+
use_dropout: bool = False,
38+
use_batchnorm: bool = True,
39+
):
40+
super().__init__()
41+
42+
layers = []
43+
in_dim = num_features
44+
45+
for out_dim in hidden_layers:
46+
block = [nn.Linear(in_dim, out_dim)]
47+
48+
if use_batchnorm:
49+
block.append(nn.BatchNorm1d(out_dim))
50+
51+
block.append(nn.ReLU())
52+
53+
if use_dropout:
54+
block.append(nn.Dropout(p))
55+
56+
layers.append(nn.Sequential(*block))
57+
in_dim = out_dim
58+
59+
self.hidden = nn.Sequential(*layers)
60+
self.output_layer = nn.Linear(in_dim, 1)
61+
62+
def forward(self, x):
63+
x = self.hidden(x)
64+
x = self.output_layer(x)
65+
return torch.sigmoid(x)
66+
67+
68+
batch_size = 1000
69+
batches_in_memory = 1000
70+
drop_remainder = True
71+
columns = ["m4l", "good_lep", "goodlep_E", "goodlep_eta", "goodlep_phi", "goodlep_pt", "goodlep_type", "isHiggsRef"]
72+
target = "isHiggsRef"
73+
max_vec_sizes = {"good_lep": 4, "goodlep_E": 4, "goodlep_eta": 4, "goodlep_phi": 4, "goodlep_pt": 4, "goodlep_type": 4}
74+
shuffle = True
75+
set_seed = 42
76+
77+
# Normalize the data!
78+
for var in tqdm(columns[:-1], desc="Normalizing the training data..."):
79+
if var == "m4l": # The only non-vector column
80+
mean = df_train.Mean(var).GetValue()
81+
stddev = df_train.StdDev(var).GetValue()
82+
df_train = df_train.Redefine(var, f"({var} - {mean}) / {stddev}")
83+
# The validation and testing data should be normalized based on the
84+
# mean and standard deviation calculated from the training data.
85+
df_val = df_val.Redefine(var, f"({var} - {mean}) / {stddev}")
86+
df_test = df_test.Redefine(var, f"({var} - {mean}) / {stddev}")
87+
else:
88+
# Each vector event has 4 columns, and we need to take a column-wise mean and stddev
89+
means = []
90+
stddevs = []
91+
for i in range(max_vec_sizes[var]):
92+
scalar_column = f"{var}_{i}"
93+
df_train = df_train.Define(scalar_column, f"{var}[{i}]")
94+
means.append(df_train.Mean(scalar_column).GetValue())
95+
stddevs.append(df_train.StdDev(scalar_column).GetValue())
96+
mean_vec = ROOT.RVec("double")(means)
97+
stddev_vec = ROOT.RVec("double")(stddevs)
98+
for i in range(len(stddevs)):
99+
if stddevs[i] == 0:
100+
stddevs[i] = 0.01 # Avoids division by 0
101+
expr = ", ".join(f"(({var}[{i}] - {means[i]}) / {stddevs[i]})" for i in range(max_vec_sizes[var]))
102+
df_train = df_train.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
103+
# The validation and testing data should be normalized based on the
104+
# mean and standard deviation calculated from the training data.
105+
df_val = df_val.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
106+
df_test = df_test.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
107+
108+
train = ROOT.Experimental.ML.RDataLoader(
109+
df_train,
110+
batch_size=batch_size,
111+
batches_in_memory=batches_in_memory,
112+
drop_remainder=drop_remainder,
113+
columns=columns,
114+
target=target,
115+
max_vec_sizes=max_vec_sizes,
116+
shuffle=shuffle,
117+
set_seed=set_seed,
118+
)
119+
val = ROOT.Experimental.ML.RDataLoader(
120+
df_val,
121+
batch_size=batch_size,
122+
batches_in_memory=batches_in_memory,
123+
drop_remainder=drop_remainder,
124+
columns=columns,
125+
target=target,
126+
max_vec_sizes=max_vec_sizes,
127+
shuffle=shuffle,
128+
set_seed=set_seed,
129+
)
130+
test = ROOT.Experimental.ML.RDataLoader(
131+
df_test,
132+
batch_size=batch_size,
133+
batches_in_memory=batches_in_memory,
134+
drop_remainder=drop_remainder,
135+
columns=columns,
136+
target=target,
137+
max_vec_sizes=max_vec_sizes,
138+
shuffle=shuffle,
139+
set_seed=set_seed,
140+
)
141+
142+
143+
num_features = sum(max_vec_sizes.values()) + len([0 for i in train.train_columns if i not in max_vec_sizes])
144+
# Must be calculated manually since columns is not yet expanded
145+
146+
torch.manual_seed(set_seed)
147+
hidden_layers = [60, 60]
148+
model = Classifier(num_features=num_features, hidden_layers=hidden_layers, p=0.2, use_dropout=False)
149+
loss_fn = nn.BCELoss()
150+
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
151+
152+
epochs = 1000
153+
last_val_losses = [float("inf")] * 6
154+
# Early stopping criterion: most recent 3 avg. losses are worse than the 3 before that
155+
avg_val_losses = []
156+
pbar = tqdm(range(epochs), desc="Initializing training...")
157+
for epoch in pbar:
158+
# training
159+
model.train()
160+
161+
for i, (x_train, y_train) in enumerate(train.as_torch()):
162+
outputs = model(x_train)
163+
loss = loss_fn(outputs, y_train)
164+
165+
optimizer.zero_grad()
166+
loss.backward()
167+
optimizer.step()
168+
169+
# validation
170+
model.eval()
171+
val_loss = 0
172+
val_correct = 0
173+
val_total = 0
174+
175+
with torch.no_grad():
176+
for j, (x_val, y_val) in enumerate(val.as_torch()):
177+
outputs = model(x_val)
178+
loss = loss_fn(outputs, y_val)
179+
val_loss += loss.item()
180+
181+
preds = (outputs > 0.5).float()
182+
val_correct += (preds == y_val).sum().item()
183+
val_total += y_val.size(0)
184+
185+
avg_val_loss = val_loss / (j + 1)
186+
pbar.set_description(f"Avg. val loss: {avg_val_loss}")
187+
avg_val_losses.append(avg_val_loss)
188+
val_accuracy = val_correct / val_total
189+
del last_val_losses[0]
190+
last_val_losses.append(avg_val_loss)
191+
# Early stopping check
192+
if min(last_val_losses[-3:]) > max(last_val_losses[:3]):
193+
print(f"Validation loss has not improved for 6 epochs, stopping training after {epoch + 1} epochs.")
194+
epochs = epoch + 1
195+
break
196+
197+
# Testing
198+
model.eval()
199+
test_loss = 0
200+
test_correct = 0
201+
test_total = 0
202+
203+
test_preds = []
204+
test_true = []
205+
with torch.no_grad():
206+
for j, (x_test, y_test) in enumerate(test.as_torch()):
207+
outputs = model(x_test)
208+
loss = loss_fn(outputs, y_test)
209+
test_loss += loss.item()
210+
test_preds += outputs
211+
test_true += y_test
212+
213+
preds = (outputs > 0.5).float()
214+
test_correct += (preds == y_test).sum().item()
215+
test_total += y_test.size(0)
216+
217+
avg_test_loss = test_loss / (j + 1)
218+
test_accuracy = test_correct / test_total
219+
220+
print(f"Testing Loss: {avg_test_loss:.4f} Accuracy: {test_accuracy:.4f}\n")
221+
222+
# Analysis
223+
fig = plt.figure()
224+
ax = plt.axes()
225+
ax.plot([i for i in range(epochs)], avg_val_losses)
226+
plt.title("Loss curve")
227+
plt.xlabel("Epoch")
228+
plt.ylabel("Validation loss")
229+
plt.savefig("loss_curve")
230+
231+
fpr, tpr, thresholds = skl.roc_curve(test_true, test_preds)
232+
fig = plt.figure()
233+
ax = plt.axes()
234+
ax.plot(fpr[:-1], tpr[:-1])
235+
plt.title("ROC curve")
236+
plt.xlabel("False positive rate")
237+
plt.ylabel("True positive rate")
238+
plt.savefig("ROC_curve")

0 commit comments

Comments
 (0)