Formatter and review suggestions from @greptile-apps

timmoon10 · timmoon10 · commit 48cc585b93e7 · 2026-05-16T11:53:39.000Z
Signed-off-by: Tim Moon <tmoon@nvidia.com>
diff --git a/transformer_engine/common/common.h b/transformer_engine/common/common.h
@@ -339,18 +339,14 @@ struct Tensor {
    * If a tensor has dimensions (D1, D2, ..., Dn), it is reinterpreted
    * as a (D1*D2*...*D(n-1), Dn) matrix.
    */
-  size_t flat_first_dim() const {
-    return flat_2d_dims()[0];
-  }
+  size_t flat_first_dim() const { return flat_2d_dims()[0]; }
 
   /*! Matrix width after tensor is flattened to 2D
    *
    * If a tensor has dimensions (D1, D2, ..., Dn), it is reinterpreted
    * as a (D1*D2*...*D(n-1), Dn) matrix.
    */
-  size_t flat_last_dim() const {
-    return flat_2d_dims()[1];
-  }
+  size_t flat_last_dim() const { return flat_2d_dims()[1]; }
 };
 
 struct GroupedTensor {
diff --git a/transformer_engine/common/include/transformer_engine/utils.h b/transformer_engine/common/include/transformer_engine/utils.h
@@ -32,7 +32,7 @@ extern "C" {
  *  \param[in]     stream       CUDA stream for the operation.
  */
 void nvte_load_value_on_device(const void *host_ptr, void *device_ptr, size_t num_bytes,
-                                cudaStream_t stream);
+                               cudaStream_t stream);
 
 /*! \deprecated Use nvte_load_value_on_device instead.
  *
diff --git a/transformer_engine/common/swizzle/swizzle.cu b/transformer_engine/common/swizzle/swizzle.cu
@@ -25,10 +25,11 @@ constexpr int MXFP8_BLOCK_SIZE = 32;
 constexpr int NVFP4_BLOCK_SIZE = 16;
 
 int get_max_dynamic_smem() {
-  auto query_max_smem = [] () -> int {
+  auto query_max_smem = []() -> int {
     int device{0}, max_smem{0};
     NVTE_CHECK_CUDA(cudaGetDevice(&device));
-    NVTE_CHECK_CUDA(cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
+    NVTE_CHECK_CUDA(
+        cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
     return max_smem;
   };
   static int cached_val = query_max_smem();
diff --git a/transformer_engine/common/transformer_engine.cpp b/transformer_engine/common/transformer_engine.cpp
@@ -98,28 +98,23 @@ void CheckScaleTensorShape(const Tensor &t, const std::string &name) {
       if (t.has_data()) {
         constexpr std::array<size_t, 2> block_shape{1, 32};
         const std::array<size_t, 2> expected{
-          DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_shape[0]), block_alignment[0]),
-          DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_shape[1]), block_alignment[1])
-        };
-        NVTE_CHECK(t.scale_inv.shape.size() == 2
-                   && t.scale_inv.shape[0] == expected[0]
-                   && t.scale_inv.shape[1] == expected[1],
-                   "Tensor \"", name,
-                   "\" has invalid scale_inv shape (expected ", expected,
+            DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_shape[0]), block_alignment[0]),
+            DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_shape[1]), block_alignment[1])};
+        NVTE_CHECK(t.scale_inv.shape.size() == 2 && t.scale_inv.shape[0] == expected[0] &&
+                       t.scale_inv.shape[1] == expected[1],
+                   "Tensor \"", name, "\" has invalid scale_inv shape (expected ", expected,
                    ", got ", t.scale_inv.shape, ")");
       }
       if (t.has_columnwise_data()) {
         constexpr std::array<size_t, 2> block_shape{32, 1};
         const std::array<size_t, 2> expected{
-          DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_shape[0]), block_alignment[1]),
-          DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_shape[1]), block_alignment[0])
-        };
-        NVTE_CHECK(t.columnwise_scale_inv.shape.size() == 2
-                   && t.columnwise_scale_inv.shape[0] == expected[0]
-                   && t.columnwise_scale_inv.shape[1] == expected[1],
-                   "Tensor \"", name,
-                   "\" has invalid columnwise_scale_inv shape (expected ", expected,
-                   ", got ", t.scale_inv.shape, ")");
+            DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_shape[0]), block_alignment[1]),
+            DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_shape[1]), block_alignment[0])};
+        NVTE_CHECK(t.columnwise_scale_inv.shape.size() == 2 &&
+                       t.columnwise_scale_inv.shape[0] == expected[0] &&
+                       t.columnwise_scale_inv.shape[1] == expected[1],
+                   "Tensor \"", name, "\" has invalid columnwise_scale_inv shape (expected ",
+                   expected, ", got ", t.columnwise_scale_inv.shape, ")");
       }
     } else if (t.scaling_mode == NVTE_NVFP4_1D_SCALING) {
       const auto [first_dim, last_dim] = t.flat_2d_dims();
@@ -128,29 +123,24 @@ void CheckScaleTensorShape(const Tensor &t, const std::string &name) {
         constexpr std::array<size_t, 2> block_shape{1, 16};
         constexpr std::array<size_t, 2> block_alignment{128, 4};
         const std::array<size_t, 2> expected{
-          DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_shape[0]), block_alignment[0]),
-          DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_shape[1]), block_alignment[1])
-        };
-        NVTE_CHECK(t.scale_inv.shape.size() == 2
-                   && t.scale_inv.shape[0] == expected[0]
-                   && t.scale_inv.shape[1] == expected[1],
-                   "Tensor \"", name,
-                   "\" has invalid scale_inv shape (expected ", expected,
+            DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_shape[0]), block_alignment[0]),
+            DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_shape[1]), block_alignment[1])};
+        NVTE_CHECK(t.scale_inv.shape.size() == 2 && t.scale_inv.shape[0] == expected[0] &&
+                       t.scale_inv.shape[1] == expected[1],
+                   "Tensor \"", name, "\" has invalid scale_inv shape (expected ", expected,
                    ", got ", t.scale_inv.shape, ")");
       }
       if (t.has_columnwise_data()) {
         constexpr std::array<size_t, 2> block_shape{1, 16};
         constexpr std::array<size_t, 2> block_alignment{128, 4};
         const std::array<size_t, 2> expected{
-          DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_shape[0]), block_alignment[0]),
-          DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_shape[1]), block_alignment[1])
-        };
-        NVTE_CHECK(t.columnwise_scale_inv.shape.size() == 2
-                   && t.columnwise_scale_inv.shape[0] == expected[0]
-                   && t.columnwise_scale_inv.shape[1] == expected[1],
-                   "Tensor \"", name,
-                   "\" has invalid columnwise_scale_inv shape (expected ", expected,
-                   ", got ", t.scale_inv.shape, ")");
+            DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_shape[0]), block_alignment[0]),
+            DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_shape[1]), block_alignment[1])};
+        NVTE_CHECK(t.columnwise_scale_inv.shape.size() == 2 &&
+                       t.columnwise_scale_inv.shape[0] == expected[0] &&
+                       t.columnwise_scale_inv.shape[1] == expected[1],
+                   "Tensor \"", name, "\" has invalid columnwise_scale_inv shape (expected ",
+                   expected, ", got ", t.columnwise_scale_inv.shape, ")");
       }
     }
   }
diff --git a/transformer_engine/common/util/utils.cu b/transformer_engine/common/util/utils.cu
@@ -4,13 +4,12 @@
  * See LICENSE for license information.
  ************************************************************************/
 
+#include <cuda_runtime.h>
 #include <transformer_engine/utils.h>
 
 #include <algorithm>
 #include <cstring>
 
-#include <cuda_runtime.h>
-
 #include "../common.h"
 #include "../util/logging.h"
 
@@ -27,7 +26,7 @@ union Payload {
 };
 
 constexpr size_t block_size = 512;
-constexpr size_t num_blocks = DIVUP(Payload::kMaxBytes / Payload::kVectorSize, block_size);
+constexpr size_t num_blocks = DIVUP(Payload::kMaxVectors, block_size);
 
 __global__ void __launch_bounds__(block_size) kernel(Payload payload, size_t num_bytes, void *out) {
   const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -56,7 +55,8 @@ void nvte_load_value_on_device(const void *host_ptr, void *device_ptr, size_t nu
 
   // Check pointers
   NVTE_CHECK(host_ptr != nullptr, "Attempting to read ", num_bytes, " bytes from a null pointer.");
-  NVTE_CHECK(device_ptr != nullptr, "Attempting to write ", num_bytes, " bytes into a null pointer.");
+  NVTE_CHECK(device_ptr != nullptr, "Attempting to write ", num_bytes,
+             " bytes into a null pointer.");
   NVTE_CHECK(reinterpret_cast<uintptr_t>(device_ptr) % Payload::kVectorSize == 0,
              "Device pointer is not aligned to ", Payload::kVectorSize, " bytes.");
 
@@ -74,6 +74,7 @@ void nvte_load_value_on_device(const void *host_ptr, void *device_ptr, size_t nu
 
 void nvte_convert_pointers_to_tensor(const uint64_t *host_ptrs, NVTETensor output, int64_t count,
                                      cudaStream_t stream) {
+  NVTE_API_CALL(nvte_convert_pointers_to_tensor);
   using namespace transformer_engine;
   Tensor *out_tensor = convertNVTETensorCheck(output);
   nvte_load_value_on_device(host_ptrs, out_tensor->data.dptr,
diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h
@@ -489,10 +489,9 @@ at::Tensor splits_to_offsets(const at::Tensor &first_dims, int64_t logical_last_
 at::Tensor load_data_ptrs_on_device(const std::vector<at::Tensor> &tensors,
                                     const c10::Device &device);
 
-std::tuple<at::Tensor, std::optional<at::Tensor>> transform_and_load_data_ptrs_on_device(const std::string &transform_type,
-                                                                                         const std::vector<at::Tensor> &tensors,
-                                                                                         const c10::Device &device);
-
+std::tuple<at::Tensor, std::optional<at::Tensor>> transform_and_load_data_ptrs_on_device(
+    const std::string &transform_type, const std::vector<at::Tensor> &tensors,
+    const c10::Device &device);
 
 /***************************************************************************************************
  * Support THD format for Context Parallel
diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp
@@ -489,10 +489,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         "Get cublasLt version", py::call_guard<py::gil_scoped_release>());
   m.def("get_cudnn_version", &transformer_engine::pytorch::get_cudnn_version, "Get cuDNN version",
         py::call_guard<py::gil_scoped_release>());
-  m.def("load_data_ptrs_on_device",
-        &transformer_engine::pytorch::load_data_ptrs_on_device,
-        py::arg("tensors"), py::arg("device"),
-        py::call_guard<py::gil_scoped_release>());
+  m.def("load_data_ptrs_on_device", &transformer_engine::pytorch::load_data_ptrs_on_device,
+        py::arg("tensors"), py::arg("device"), py::call_guard<py::gil_scoped_release>());
   m.def("transform_and_load_data_ptrs_on_device",
         &transformer_engine::pytorch::transform_and_load_data_ptrs_on_device,
         py::arg("transform_type"), py::arg("tensors"), py::arg("device"),
diff --git a/transformer_engine/pytorch/csrc/extensions/utils.cpp b/transformer_engine/pytorch/csrc/extensions/utils.cpp
@@ -32,15 +32,14 @@ at::Tensor load_data_ptrs_on_device(const std::vector<at::Tensor> &tensors,
 
   // Load pointers on device
   nvte_load_value_on_device(ptrs_host.data(), ptrs_device.data_ptr(),
-                            tensors.size() * sizeof(uint64_t),
-                            at::cuda::getCurrentCUDAStream());
+                            tensors.size() * sizeof(uint64_t), at::cuda::getCurrentCUDAStream());
 
   return ptrs_device;
 }
 
-std::tuple<at::Tensor, std::optional<at::Tensor>> transform_and_load_data_ptrs_on_device(const std::string &transform_type,
-                                                                                         const std::vector<at::Tensor> &tensors,
-                                                                                         const c10::Device &device) {
+std::tuple<at::Tensor, std::optional<at::Tensor>> transform_and_load_data_ptrs_on_device(
+    const std::string &transform_type, const std::vector<at::Tensor> &tensors,
+    const c10::Device &device) {
   const size_t num_tensors = tensors.size();
 
   // Trivial cases
@@ -50,9 +49,8 @@ std::tuple<at::Tensor, std::optional<at::Tensor>> transform_and_load_data_ptrs_o
   }
   if (num_tensors == 0) {
     // No input tensors, return tensor with no elements
-    return {
-      at::empty({int64_t{0}}, at::TensorOptions().dtype(at::kLong).device(device)),
-      std::nullopt};
+    return {at::empty({int64_t{0}}, at::TensorOptions().dtype(at::kLong).device(device)),
+            std::nullopt};
   }
 
   // CUDA stream
@@ -62,9 +60,7 @@ std::tuple<at::Tensor, std::optional<at::Tensor>> transform_and_load_data_ptrs_o
   const bool uniform_mxfp8_rowwise_swizzle = transform_type == "uniform_mxfp8_rowwise_swizzle";
   const bool uniform_mxfp8_colwise_swizzle = transform_type == "uniform_mxfp8_columnwise_swizzle";
   const bool uniform_nvfp4_swizzle = transform_type == "uniform_nvfp4_swizzle";
-  if (uniform_mxfp8_rowwise_swizzle
-      || uniform_mxfp8_colwise_swizzle
-      || uniform_nvfp4_swizzle) {
+  if (uniform_mxfp8_rowwise_swizzle || uniform_mxfp8_colwise_swizzle || uniform_nvfp4_swizzle) {
     // Tensor format
     NVTEScalingMode scaling_mode = NVTE_INVALID_SCALING;
     if (uniform_mxfp8_rowwise_swizzle || uniform_mxfp8_colwise_swizzle) {
@@ -76,16 +72,16 @@ std::tuple<at::Tensor, std::optional<at::Tensor>> transform_and_load_data_ptrs_o
     // Data types
     transformer_engine::DType data_dtype, scale_dtype;
     switch (scaling_mode) {
-    case NVTE_MXFP8_1D_SCALING:
-      data_dtype = transformer_engine::DType::kFloat8E4M3;
-      scale_dtype = transformer_engine::DType::kFloat8E8M0;
-      break;
-    case NVTE_NVFP4_1D_SCALING:
-      data_dtype = transformer_engine::DType::kFloat4E2M1;
-      scale_dtype = transformer_engine::DType::kFloat8E4M3;
-      break;
-    default:
-      NVTE_ERROR("Unsupported case.");
+      case NVTE_MXFP8_1D_SCALING:
+        data_dtype = transformer_engine::DType::kFloat8E4M3;
+        scale_dtype = transformer_engine::DType::kFloat8E8M0;
+        break;
+      case NVTE_NVFP4_1D_SCALING:
+        data_dtype = transformer_engine::DType::kFloat4E2M1;
+        scale_dtype = transformer_engine::DType::kFloat8E4M3;
+        break;
+      default:
+        NVTE_ERROR("Unsupported case.");
     }
 
     // Scale shape
@@ -128,8 +124,8 @@ std::tuple<at::Tensor, std::optional<at::Tensor>> transform_and_load_data_ptrs_o
     for (size_t i = 0; i < num_tensors; ++i) {
       inputs_nvte.emplace_back(scaling_mode);
       outputs_nvte.emplace_back(scaling_mode);
-      auto& input_nvte = inputs_nvte.back();
-      auto& output_nvte = outputs_nvte.back();
+      auto &input_nvte = inputs_nvte.back();
+      auto &output_nvte = outputs_nvte.back();
       output_nvte.set_with_gemm_swizzled_scales(true);
       void *in_scale_ptr = tensors[i].data_ptr();
       void *out_scale_ptr = swizzled_scales_dptr + i * swizzled_scales_stride;
@@ -150,28 +146,26 @@ std::tuple<at::Tensor, std::optional<at::Tensor>> transform_and_load_data_ptrs_o
     std::vector<NVTETensor> inputs_nvte_raw, outputs_nvte_raw;
     inputs_nvte_raw.reserve(num_tensors);
     outputs_nvte_raw.reserve(num_tensors);
-    for (auto& t : inputs_nvte) inputs_nvte_raw.push_back(t.data());
-    for (auto& t : outputs_nvte) outputs_nvte_raw.push_back(t.data());
+    for (auto &t : inputs_nvte) inputs_nvte_raw.push_back(t.data());
+    for (auto &t : outputs_nvte) outputs_nvte_raw.push_back(t.data());
 
     // Launch kernel
     nvte_multi_tensor_swizzle_scaling_factors(inputs_nvte_raw.data(), outputs_nvte_raw.data(),
-                                              inputs_nvte_raw.size(),
-                                              stream);
+                                              inputs_nvte_raw.size(), stream);
 
     // Collect data pointers
     std::vector<uint64_t> ptrs_host;
     ptrs_host.reserve(num_tensors);
     for (size_t i = 0; i < num_tensors; ++i) {
-      ptrs_host.push_back(reinterpret_cast<uintptr_t>(swizzled_scales_dptr
-                                                      + i * swizzled_scales_stride));
+      ptrs_host.push_back(
+          reinterpret_cast<uintptr_t>(swizzled_scales_dptr + i * swizzled_scales_stride));
     }
 
     // Load pointers on device
     auto ptrs_device = at::empty({static_cast<int64_t>(num_tensors)},
                                  at::TensorOptions().dtype(at::kLong).device(device));
     nvte_load_value_on_device(ptrs_host.data(), ptrs_device.data_ptr(),
-                              num_tensors * sizeof(uint64_t),
-                              stream);
+                              num_tensors * sizeof(uint64_t), stream);
 
     return {std::move(ptrs_device), std::move(swizzled_scales)};
   }

Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ extern "C" {`
`32`	`32`	`* \param[in] stream CUDA stream for the operation.`
`33`	`33`	`*/`
`34`	`34`	`void nvte_load_value_on_device(const void *host_ptr, void *device_ptr, size_t num_bytes,`
`35`		`-                                cudaStream_t stream);`
	`35`	`+                               cudaStream_t stream);`
`36`	`36`
`37`	`37`	`/*! \deprecated Use nvte_load_value_on_device instead.`
`38`	`38`	`*`