linting

denera · denera · commit ee517d3592e1 · 2025-12-17T02:50:11.000Z
Signed-off-by: Alp Dener <adener@nvidia.com>
diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend
@@ -1 +1 @@
-Subproject commit 0258951d4d512f4714eb1574496f4d57669b1b93
+Subproject commit be6c079be8aaffa0fc079fcf039887e637c289c7
diff --git a/tests/pytorch/distributed/run_gemm_with_overlap.py b/tests/pytorch/distributed/run_gemm_with_overlap.py
@@ -24,7 +24,7 @@
     MXFP8Quantizer,
 )
 import transformer_engine.pytorch.cpp_extensions as tex
-from transformer_engine.pytorch.cpp_extensions.gemm import get_cublas_workspace_size_bytes
+
 from transformer_engine.pytorch.module.base import fill_userbuffers_buffer_for_all_gather
 
 warnings.filterwarnings("ignore", category=DeprecationWarning)
diff --git a/transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp b/transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp
@@ -84,8 +84,11 @@ CommOverlapCore::CommOverlapCore(int64_t nccl_comm_ptr, int tp_rank, int tp_size
       "Comm+GEMM overlap with cuBLASMp backend requires TE to be built with NVTE_WITH_CUBLASMP=1.");
   _with_cublasmp = true;
 
-  nvte_comm_gemm_ctx_create(reinterpret_cast<ncclComm_t>(nccl_comm_ptr), tp_size, tp_rank);
+  _cublasmp_ctx = nvte_comm_gemm_ctx_create(reinterpret_cast<ncclComm_t>(nccl_comm_ptr), tp_size,
+                                            tp_rank);
 
+  _tp_id = tp_rank;
+  _tp_size = tp_size;
   _num_comm_sm = num_comm_sm;
   _is_p2p = is_p2p;
   _atomic_gemm = atomic_gemm;
diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py
@@ -331,7 +331,6 @@ def add_ub(
         comm_priority: int = 0,
         gemm_priority: int = 0,
         pipeline_rs_overlap_first_gemm: bool = False,
-        with_cublasmp: bool = False,
     ) -> None:
         if atomic_gemm:
             warnings.warn(
@@ -506,7 +505,7 @@ def fill_userbuffers_buffer_for_all_gather(
     """
     # cuBlasMp already handles its own buffer filling and quantization factors
     if comm.with_cublasmp():
-        return
+        return local_tensor, local_tensor
 
     # Tensor dimensions
     local_shape = local_tensor.size()

Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@`
`24`	`24`	`MXFP8Quantizer,`
`25`	`25`	`)`
`26`	`26`	`import transformer_engine.pytorch.cpp_extensions as tex`
`27`		`-from transformer_engine.pytorch.cpp_extensions.gemm import get_cublas_workspace_size_bytes`
	`27`	`+`
`28`	`28`	`from transformer_engine.pytorch.module.base import fill_userbuffers_buffer_for_all_gather`
`29`	`29`
`30`	`30`	`warnings.filterwarnings("ignore", category=DeprecationWarning)`