Skip to content

Commit 73a662b

Browse files
committed
Change size_average to reduction, and make a note about sums vs means in MSELoss
1 parent 3a4ef4f commit 73a662b

5 files changed

Lines changed: 18 additions & 10 deletions

File tree

README.md

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -464,8 +464,12 @@ model = torch.nn.Sequential(
464464
).to(device)
465465

466466
# The nn package also contains definitions of popular loss functions; in this
467-
# case we will use Mean Squared Error (MSE) as our loss function.
468-
loss_fn = torch.nn.MSELoss(size_average=False)
467+
# case we will use Mean Squared Error (MSE) as our loss function. Setting
468+
# reduction='sum' means that we are computing the *sum* of squared errors rather
469+
# than the mean; this is for consistency with the examples above where we
470+
# manually compute the loss, but in practice it is more common to use mean
471+
# squared error as a loss by setting reduction='elementwise_mean'.
472+
loss_fn = torch.nn.MSELoss(reduction='sum')
469473

470474
learning_rate = 1e-4
471475
for t in range(500):
@@ -528,7 +532,7 @@ model = torch.nn.Sequential(
528532
torch.nn.ReLU(),
529533
torch.nn.Linear(H, D_out),
530534
)
531-
loss_fn = torch.nn.MSELoss(size_average=False)
535+
loss_fn = torch.nn.MSELoss(reduction='sum')
532536

533537
# Use the optim package to define an Optimizer that will update the weights of
534538
# the model for us. Here we will use Adam; the optim package contains many other
@@ -603,7 +607,7 @@ model = TwoLayerNet(D_in, H, D_out)
603607
# Construct our loss function and an Optimizer. The call to model.parameters()
604608
# in the SGD constructor will contain the learnable parameters of the two
605609
# nn.Linear modules which are members of the model.
606-
loss_fn = torch.nn.MSELoss(size_average=False)
610+
loss_fn = torch.nn.MSELoss(reduction='sum')
607611
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
608612
for t in range(500):
609613
# Forward pass: Compute predicted y by passing x to the model
@@ -683,7 +687,7 @@ model = DynamicNet(D_in, H, D_out)
683687

684688
# Construct our loss function and an Optimizer. Training this strange model with
685689
# vanilla stochastic gradient descent is tough, so we use momentum
686-
criterion = torch.nn.MSELoss(size_average=False)
690+
criterion = torch.nn.MSELoss(reduction='sum')
687691
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
688692
for t in range(500):
689693
# Forward pass: Compute predicted y by passing x to the model

nn/dynamic_net.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ def forward(self, x):
5353

5454
# Construct our loss function and an Optimizer. Training this strange model with
5555
# vanilla stochastic gradient descent is tough, so we use momentum
56-
criterion = torch.nn.MSELoss(size_average=False)
56+
criterion = torch.nn.MSELoss(reduction='sum')
5757
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
5858
for t in range(500):
5959
# Forward pass: Compute predicted y by passing x to the model

nn/two_layer_net_module.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ def forward(self, x):
4343
# Construct our loss function and an Optimizer. The call to model.parameters()
4444
# in the SGD constructor will contain the learnable parameters of the two
4545
# nn.Linear modules which are members of the model.
46-
loss_fn = torch.nn.MSELoss(size_average=False)
46+
loss_fn = torch.nn.MSELoss(reduction='sum')
4747
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
4848
for t in range(500):
4949
# Forward pass: Compute predicted y by passing x to the model

nn/two_layer_net_nn.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,8 +36,12 @@
3636
).to(device)
3737

3838
# The nn package also contains definitions of popular loss functions; in this
39-
# case we will use Mean Squared Error (MSE) as our loss function.
40-
loss_fn = torch.nn.MSELoss(size_average=False)
39+
# case we will use Mean Squared Error (MSE) as our loss function. Setting
40+
# reduction='sum' means that we are computing the *sum* of squared errors rather
41+
# than the mean; this is for consistency with the examples above where we
42+
# manually compute the loss, but in practice it is more common to use mean
43+
# squared error as a loss by setting reduction='elementwise_mean'.
44+
loss_fn = torch.nn.MSELoss(reduction='sum')
4145

4246
learning_rate = 1e-4
4347
for t in range(500):

nn/two_layer_net_optim.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@
2626
torch.nn.ReLU(),
2727
torch.nn.Linear(H, D_out),
2828
)
29-
loss_fn = torch.nn.MSELoss(size_average=False)
29+
loss_fn = torch.nn.MSELoss(reduction='sum')
3030

3131
# Use the optim package to define an Optimizer that will update the weights of
3232
# the model for us. Here we will use Adam; the optim package contains many other

0 commit comments

Comments
 (0)