Merge branch 'main' into issues-3859

thesteve0 · web-flow · commit 09ef4a3f2e39 · 2026-05-13T17:50:59.000-04:00
diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
@@ -25,7 +25,7 @@ pydantic>=2.10
 fastapi
 matplotlib
 librosa
-torch==2.11
+torch==2.12
 torchvision
 torchdata
 networkx
diff --git a/beginner_source/blitz/neural_networks_tutorial.py b/beginner_source/blitz/neural_networks_tutorial.py
@@ -45,13 +45,13 @@
 class Net(nn.Module):
 
     def __init__(self):
-        super(Net, self).__init__()
+        super().__init__()
         # 1 input image channel, 6 output channels, 5x5 square convolution
         # kernel
         self.conv1 = nn.Conv2d(1, 6, 5)
         self.conv2 = nn.Conv2d(6, 16, 5)
         # an affine operation: y = Wx + b
-        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension 
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
         self.fc2 = nn.Linear(120, 84)
         self.fc3 = nn.Linear(84, 10)
 
@@ -205,7 +205,9 @@ def forward(self, input):
 #
 #
 # Now we shall call ``loss.backward()``, and have a look at conv1's bias
-# gradients before and after the backward.
+# gradients before and after the backward pass. Since we have not introduced
+# an optimizer yet, we clear the gradients directly on the model. Once you use
+# an optimizer, prefer ``optimizer.zero_grad()``, as shown below.
 
 
 net.zero_grad()     # zeroes the gradient buffers of all parameters
@@ -246,7 +248,8 @@ def forward(self, input):
 #
 #     learning_rate = 0.01
 #     for f in net.parameters():
-#         f.data.sub_(f.grad.data * learning_rate)
+#         with torch.no_grad():
+#             f -= f.grad * learning_rate
 #
 # However, as you use neural networks, you want to use various different
 # update rules such as SGD, Nesterov-SGD, Adam, RMSProp, etc.
diff --git a/beginner_source/blitz/tensor_tutorial.py b/beginner_source/blitz/tensor_tutorial.py
@@ -105,9 +105,9 @@
 #
 
 # We move our tensor to the GPU if available
-if torch.cuda.is_available():
-  tensor = tensor.to('cuda')
-  print(f"Device tensor is stored on: {tensor.device}")
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else 'cpu'
+tensor = tensor.to(device)
+print(f"Device tensor is stored on: {tensor.device}")
 
 
 ######################################################################
diff --git a/beginner_source/ddp_series_multigpu.rst b/beginner_source/ddp_series_multigpu.rst
@@ -202,6 +202,7 @@ Running the distributed training job
 Here's what the code looks like:
 
 .. code-block:: python
+
    def main(rank, world_size, total_epochs, save_every):
       ddp_setup(rank, world_size)
       dataset, model, optimizer = load_train_objs()
@@ -218,7 +219,6 @@ Here's what the code looks like:
       mp.spawn(main, args=(world_size, total_epochs, save_every,), nprocs=world_size)
 
 
-
 Further Reading
 ---------------
 
diff --git a/beginner_source/introyt/modelsyt_tutorial.py b/beginner_source/introyt/modelsyt_tutorial.py
@@ -48,12 +48,12 @@ class is a subclass of ``torch.Tensor``, with the special behavior that
 class TinyModel(torch.nn.Module):
     
     def __init__(self):
-        super(TinyModel, self).__init__()
+        super().__init__()
         
         self.linear1 = torch.nn.Linear(100, 200)
         self.activation = torch.nn.ReLU()
         self.linear2 = torch.nn.Linear(200, 10)
-        self.softmax = torch.nn.Softmax()
+        self.softmax = torch.nn.Softmax(dim=1)
     
     def forward(self, x):
         x = self.linear1(x)
@@ -150,7 +150,7 @@ def forward(self, x):
 class LeNet(torch.nn.Module):
 
     def __init__(self):
-        super(LeNet, self).__init__()
+        super().__init__()
         # 1 input image channel (black & white), 6 output channels, 5x5 square convolution
         # kernel
         self.conv1 = torch.nn.Conv2d(1, 6, 5)
@@ -249,7 +249,7 @@ def num_flat_features(self, x):
 class LSTMTagger(torch.nn.Module):
 
     def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
-        super(LSTMTagger, self).__init__()
+        super().__init__()
         self.hidden_dim = hidden_dim
 
         self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
diff --git a/compilers_index.rst b/compilers_index.rst
@@ -166,7 +166,7 @@ control, as well as third-party backend solutions.
    :header: Building a Convolution/Batch Norm fuser in FX
    :card_description: Build a simple FX pass that fuses batch norm into convolution to improve performance during inference.
    :image: _static/img/thumbnails/cropped/Deploying-PyTorch-in-Python-via-a-REST-API-with-Flask.png
-   :link: intermediate/torch_compile_conv_bn_fuser
+   :link: intermediate/torch_compile_conv_bn_fuser.html
    :tags: FX
 
 .. customcarditem::
diff --git a/index.rst b/index.rst
@@ -3,6 +3,7 @@ Welcome to PyTorch Tutorials
 
 **What's new in PyTorch tutorials?**
 
+* `Data Loading Optimization in PyTorch <https://docs.pytorch.org/tutorials/intermediate/intermediate_data_loading_tutorial.html>`__
 * `Distributed Training with Ray Train <https://docs.pytorch.org/tutorials/beginner/distributed_training_with_ray_tutorial.html>`__
 * `Serve PyTorch models at scale with Ray Serve <https://docs.pytorch.org/tutorials/beginner/serving_tutorial.html>`__
 * `Hyperparameter tuning using Ray Tune <https://docs.pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html>`__