diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 79decaf95022..70731e28f5bb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,35 +1,35 @@
 repos:
 
   - repo: https://github.com/PyCQA/autoflake
-    rev: v2.3.1
+    rev: v2.3.3
     hooks:
       - id: autoflake
         name: autoflake (python)
         args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports']
 
   - repo: https://github.com/pycqa/isort
-    rev: 5.13.2
+    rev: 8.0.1
     hooks:
       - id: isort
         name: sort all imports (python)
         args: ["--profile", "black"] # avoid conflict with black
 
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.10.0
+    rev: 26.3.1
     hooks:
     - id: black
       name: black formatter
       args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']
 
   - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v19.1.5
+    rev: v22.1.2
     hooks:
     - id: clang-format
       name: clang formatter
       types_or: [c++, c]
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
     hooks:
       - id: check-yaml
       - id: check-merge-conflict
diff --git a/applications/Colossal-LLaMA/colossal_llama/model/init_model.py b/applications/Colossal-LLaMA/colossal_llama/model/init_model.py
index f61291f35d04..7391594d9556 100644
--- a/applications/Colossal-LLaMA/colossal_llama/model/init_model.py
+++ b/applications/Colossal-LLaMA/colossal_llama/model/init_model.py
@@ -4,6 +4,7 @@
 """
 Initialize new model with updated tokenizer by calculating the mean values from original model
 """
+
 import argparse
 
 import numpy as np
diff --git a/applications/ColossalChat/coati/dataset/tokenization_utils.py b/applications/ColossalChat/coati/dataset/tokenization_utils.py
index 893090edfa30..4c7ed0f36909 100755
--- a/applications/ColossalChat/coati/dataset/tokenization_utils.py
+++ b/applications/ColossalChat/coati/dataset/tokenization_utils.py
@@ -56,10 +56,8 @@ def tokenize_sft(
     template.messages = []
     for idx, mess in enumerate(messages):
         if mess["from"] != template.roles[idx % 2]:
-            raise ValueError(
-                f"Message should iterate between user and assistant and starts with a \
-                             line from the user. Got the following data:\n{messages}"
-            )
+            raise ValueError(f"Message should iterate between user and assistant and starts with a \
+                             line from the user. Got the following data:\n{messages}")
         template.append_message(mess["from"], mess["content"])
 
     if len(template.messages) % 2 != 0:
@@ -245,10 +243,8 @@ def tokenize_rlhf(
 
     for idx, mess in enumerate(context):
         if mess["from"] != template.roles[idx % 2]:
-            raise ValueError(
-                f"Message should iterate between user and assistant and starts with a \
-                             line from the user. Got the following data:\n{context}"
-            )
+            raise ValueError(f"Message should iterate between user and assistant and starts with a \
+                             line from the user. Got the following data:\n{context}")
         template.append_message(mess["from"], mess["content"])
 
     if len(template.messages) % 2 != 1:
@@ -272,18 +268,14 @@ def tokenize_rlhf(
     rejected_continuation = data_point["rejected"]
     for round in range(len(chosen_continuation)):
         if chosen_continuation[round]["from"] != template.roles[(round + 1) % 2]:
-            raise ValueError(
-                f"Message should iterate between user and assistant and starts with a \
-                             line from the user. Got the following data:\n{chosen_continuation}"
-            )
+            raise ValueError(f"Message should iterate between user and assistant and starts with a \
+                             line from the user. Got the following data:\n{chosen_continuation}")
         chosen.append_message(chosen_continuation[round]["from"], chosen_continuation[round]["content"])
 
     for round in range(len(rejected_continuation)):
         if rejected_continuation[round]["from"] != template.roles[(round + 1) % 2]:
-            raise ValueError(
-                f"Message should iterate between user and assistant and starts with a \
-                             line from the user. Got the following data:\n{rejected_continuation}"
-            )
+            raise ValueError(f"Message should iterate between user and assistant and starts with a \
+                             line from the user. Got the following data:\n{rejected_continuation}")
         rejected.append_message(rejected_continuation[round]["from"], rejected_continuation[round]["content"])
 
     (
@@ -296,14 +288,14 @@ def tokenize_rlhf(
     ) = (None, None, None, None, None, None)
 
     chosen_data_packed = apply_rlhf_data_format(chosen, tokenizer)
-    (chosen_input_ids, chosen_loss_mask, chosen_label_decode) = (
+    chosen_input_ids, chosen_loss_mask, chosen_label_decode = (
         chosen_data_packed["input_ids"],
         chosen_data_packed["loss_mask"],
         chosen_data_packed["label_decode"],
     )
 
     rejected_data_packed = apply_rlhf_data_format(rejected, tokenizer)
-    (rejected_input_ids, rejected_loss_mask, rejected_label_decode) = (
+    rejected_input_ids, rejected_loss_mask, rejected_label_decode = (
         rejected_data_packed["input_ids"],
         rejected_data_packed["loss_mask"],
         rejected_data_packed["label_decode"],
diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py
index f7a2fb89cadb..a7573d2f201b 100644
--- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py
+++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py
@@ -17,7 +17,6 @@
 https://github.com/volcengine/verl
 """
 
-
 import json
 
 import torch
diff --git a/applications/ColossalChat/coati/trainer/kto.py b/applications/ColossalChat/coati/trainer/kto.py
index f87bf53c40cf..afbd4fe10f5c 100755
--- a/applications/ColossalChat/coati/trainer/kto.py
+++ b/applications/ColossalChat/coati/trainer/kto.py
@@ -130,7 +130,7 @@ def _train(self, epoch: int):
         )
         for i, batch in enumerate(self.train_dataloader):
             batch = to_device(batch, self.device)
-            (input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask) = (
+            input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask = (
                 batch["input_ids"],
                 batch["attention_mask"],
                 batch["loss_mask"],
@@ -279,7 +279,7 @@ def _eval(self, epoch: int):
         )
         for i, batch in enumerate(self.train_dataloader):
             batch = to_device(batch, self.device)
-            (input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask) = (
+            input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask = (
                 batch["input_ids"],
                 batch["attention_mask"],
                 batch["loss_mask"],
diff --git a/applications/ColossalChat/examples/community/ray/train_prompts_on_ray.py b/applications/ColossalChat/examples/community/ray/train_prompts_on_ray.py
index 8abd83a8b249..3de2e07344b8 100755
--- a/applications/ColossalChat/examples/community/ray/train_prompts_on_ray.py
+++ b/applications/ColossalChat/examples/community/ray/train_prompts_on_ray.py
@@ -120,7 +120,7 @@ def _init_optimizer(self):
     def _prepare_model_with_strategy(self, has_optimizer: bool):
         if has_optimizer:
             self._init_optimizer()
-            (self._model, self._optimizer) = self._strategy.prepare((self._model, self._optimizer))
+            self._model, self._optimizer = self._strategy.prepare((self._model, self._optimizer))
         else:
             self._model = self._strategy.prepare(self._model)
 
diff --git a/applications/ColossalQA/examples/webui_demo/webui.py b/applications/ColossalQA/examples/webui_demo/webui.py
index 1e34330615b5..5ab5df99fab6 100644
--- a/applications/ColossalQA/examples/webui_demo/webui.py
+++ b/applications/ColossalQA/examples/webui_demo/webui.py
@@ -81,11 +81,11 @@ def restart(chatbot, txt):
     )
     with gr.Row():
         btn = gr.UploadButton("📁", file_types=["file"], file_count="multiple", size="sm")
-        restart_btn = gr.Button(str("\u21BB"), elem_id="restart-btn", scale=1)
+        restart_btn = gr.Button(str("\u21bb"), elem_id="restart-btn", scale=1)
         txt = gr.Textbox(
             scale=8,
             show_label=False,
-            placeholder="Enter text and press enter, or use 📁 to upload files, click \u21BB to clear loaded files and restart chat",
+            placeholder="Enter text and press enter, or use 📁 to upload files, click \u21bb to clear loaded files and restart chat",
             container=True,
             autofocus=True,
         )
diff --git a/colossalai/auto_parallel/tensor_shard/solver/solver.py b/colossalai/auto_parallel/tensor_shard/solver/solver.py
index 088d1acb5177..447a4028e368 100644
--- a/colossalai/auto_parallel/tensor_shard/solver/solver.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/solver.py
@@ -1,6 +1,6 @@
 """This code is adapted from Alpa
-    https://github.com/alpa-projects/alpa/
-   with some changes. """
+ https://github.com/alpa-projects/alpa/
+with some changes."""
 
 import multiprocessing
 import time
diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py
index 8a60ba681f70..92ef70d4736c 100644
--- a/colossalai/autochunk/select_chunk.py
+++ b/colossalai/autochunk/select_chunk.py
@@ -176,7 +176,7 @@ def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos):
         return best_region
 
     def _is_legal_region(self, cur_chunk_info, chunk_infos):
-        (chunk_region_start, chunk_region_end) = cur_chunk_info["region"]
+        chunk_region_start, chunk_region_end = cur_chunk_info["region"]
         if cur_chunk_info in chunk_infos:
             return False
         if chunk_region_end < chunk_region_start:
diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index a81f9b05d7d7..36eccc94f5eb 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -338,10 +338,8 @@ def load_sharded_optimizer(
         # Load param_groups.
         param_group_path = ckpt_index_file.get_param_group_filename()
         if param_group_path is None:
-            raise RuntimeError(
-                f"Invalid index file path {checkpoint_index_file} for an optimizer. \
-                               Lacking param group file under current directory."
-            )
+            raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \
+                               Lacking param group file under current directory.")
         saved_param_groups = torch.load(param_group_path)
         optimizer.load_param_groups(saved_param_groups)
 
diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 642969be3a68..75ec7a3487a7 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -268,10 +268,8 @@ def load_sharded_optimizer(
         # Load param_groups
         param_group_path = ckpt_index_file.get_param_group_filename()
         if param_group_path is None:
-            raise RuntimeError(
-                f"Invalid index file path {index_file_path} for an optimizer. \
-                               Lacking param group file under current directory."
-            )
+            raise RuntimeError(f"Invalid index file path {index_file_path} for an optimizer. \
+                               Lacking param group file under current directory.")
         id_map = load_param_groups_into_optimizer(optimizer, param_group_path)
 
         checkpoint_files, _ = ckpt_index_file.get_checkpoint_filenames()
diff --git a/colossalai/checkpoint_io/general_checkpoint_io.py b/colossalai/checkpoint_io/general_checkpoint_io.py
index 5dfb09248b53..d511ec76ec60 100644
--- a/colossalai/checkpoint_io/general_checkpoint_io.py
+++ b/colossalai/checkpoint_io/general_checkpoint_io.py
@@ -86,10 +86,8 @@ def load_sharded_optimizer(
         # Load param_groups
         param_group_path = ckpt_index_file.get_param_group_filename()
         if param_group_path is None:
-            raise RuntimeError(
-                f"Invalid index file path {index_file_path} for an optimizer. \
-                               Lacking param group file under current directory."
-            )
+            raise RuntimeError(f"Invalid index file path {index_file_path} for an optimizer. \
+                               Lacking param group file under current directory.")
         id_map = load_param_groups_into_optimizer(optimizer, param_group_path)
 
         checkpoint_files, _ = ckpt_index_file.get_checkpoint_filenames()
diff --git a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
index 9d972635214d..9ae29cd5ed02 100644
--- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
+++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
@@ -690,10 +690,8 @@ def _get_param_id_from_optimizer_param(
         # Load param_groups
         param_group_path = ckpt_index_file.get_param_group_filename()
         if param_group_path is None:
-            raise RuntimeError(
-                f"Invalid index file path {checkpoint_index_file} for an optimizer. \
-                               Lacking param group file under current directory."
-            )
+            raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \
+                               Lacking param group file under current directory.")
         saved_groups = torch.load(param_group_path)
 
         updated_groups = []
diff --git a/colossalai/checkpoint_io/moe_checkpoint.py b/colossalai/checkpoint_io/moe_checkpoint.py
index 85e36f7c6336..d7e2460dd422 100644
--- a/colossalai/checkpoint_io/moe_checkpoint.py
+++ b/colossalai/checkpoint_io/moe_checkpoint.py
@@ -559,10 +559,8 @@ def _get_param_id_from_optimizer_param(
         # Load param_groups
         param_group_path = ckpt_index_file.get_param_group_filename()
         if param_group_path is None:
-            raise RuntimeError(
-                f"Invalid index file path {checkpoint_index_file} for an optimizer. \
-                               Lacking param group file under current directory."
-            )
+            raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \
+                               Lacking param group file under current directory.")
         saved_groups = torch.load(param_group_path)
 
         updated_groups = []
diff --git a/colossalai/device/alpha_beta_profiler.py b/colossalai/device/alpha_beta_profiler.py
index 88520b2a14d0..f3b0150c88fa 100644
--- a/colossalai/device/alpha_beta_profiler.py
+++ b/colossalai/device/alpha_beta_profiler.py
@@ -138,7 +138,7 @@ def profile_latency(self, process_group, pg_handler):
         latency_list = []
         for i in range(self.latency_iters):
             nbytes = int(BYTE << i)
-            (t, _) = self._profile(process_group, pg_handler, nbytes)
+            t, _ = self._profile(process_group, pg_handler, nbytes)
             latency_list.append(t)
 
         if latency_list[0] is None:
@@ -157,7 +157,7 @@ def profile_bandwidth(self, process_group, pg_handler, maxbytes=(1 * GB)):
             process_group: A tuple of global rank of the process group.
             pg_handler: The handler of the process group.
         """
-        (_, bandwidth) = self._profile(process_group, pg_handler, maxbytes)
+        _, bandwidth = self._profile(process_group, pg_handler, maxbytes)
         return bandwidth
 
     def profile_ab(self):
diff --git a/colossalai/device/device_mesh.py b/colossalai/device/device_mesh.py
index 171d8876201f..a88093182c49 100644
--- a/colossalai/device/device_mesh.py
+++ b/colossalai/device/device_mesh.py
@@ -1,6 +1,6 @@
 """This code is adapted from Alpa
-    https://github.com/alpa-projects/alpa/
-   with some changes. """
+ https://github.com/alpa-projects/alpa/
+with some changes."""
 
 import operator
 from dataclasses import dataclass
diff --git a/colossalai/fx/tracer/tracer.py b/colossalai/fx/tracer/tracer.py
index d9cb587b5d39..5ad7c410a70a 100644
--- a/colossalai/fx/tracer/tracer.py
+++ b/colossalai/fx/tracer/tracer.py
@@ -4,6 +4,7 @@
     Implemented a tracer which supports control flow and user-defined meta arguments.
     The implementation is partly inspired HuggingFace's fx tracer
 """
+
 import enum
 import functools
 import inspect
diff --git a/colossalai/inference/struct.py b/colossalai/inference/struct.py
index 65d284296bcb..a0af7d4755e8 100644
--- a/colossalai/inference/struct.py
+++ b/colossalai/inference/struct.py
@@ -168,9 +168,7 @@ def recycle(self) -> None:
         """
         Recycle a running sequnce to waiitting list
         """
-        assert (
-            not self.check_finish() and not self.status == RequestStatus.ABORTED
-        ), "The running sequence \
+        assert not self.check_finish() and not self.status == RequestStatus.ABORTED, "The running sequence \
         is already done but it still in running list"
         self.status = RequestStatus.RECYCLED
 
diff --git a/colossalai/kernel/triton/llama_act_combine_kernel.py b/colossalai/kernel/triton/llama_act_combine_kernel.py
index 7a2c7e8fbd74..dcf59ec09129 100644
--- a/colossalai/kernel/triton/llama_act_combine_kernel.py
+++ b/colossalai/kernel/triton/llama_act_combine_kernel.py
@@ -157,7 +157,7 @@ def forward(ctx: Any, x_gate: torch.Tensor, x_up: torch.Tensor, activation: str
         @custom_bwd
         def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, Tensor, None, None]:
             # restore from ctx
-            (x_gate1, x_gate2, x_up) = ctx.saved_tensors
+            x_gate1, x_gate2, x_up = ctx.saved_tensors
             M, N, BLOCK_SIZE, num_warps = ctx.M, ctx.N, ctx.BLOCK_SIZE, ctx.num_warps
 
             # init grad
diff --git a/colossalai/legacy/engine/schedule/_base_schedule.py b/colossalai/legacy/engine/schedule/_base_schedule.py
index 9b2913442225..8e779a308482 100644
--- a/colossalai/legacy/engine/schedule/_base_schedule.py
+++ b/colossalai/legacy/engine/schedule/_base_schedule.py
@@ -135,8 +135,6 @@ def _call_engine_criterion(engine, outputs, labels):
         elif isinstance(outputs, dict) and isinstance(labels, (list, tuple)):
             raise ValueError(f"Expected labels to be a dict when the model outputs are dict, but got {type(labels)}")
         else:
-            raise TypeError(
-                f"Expected model outputs and labels to be of type torch.Tensor ' \
+            raise TypeError(f"Expected model outputs and labels to be of type torch.Tensor ' \
                 '(which is auto-converted to tuple), list, tuple, or dict, ' \
-                'but got {type(outputs)} (model outputs) and {type(labels)} (labels)"
-            )
+                'but got {type(outputs)} (model outputs) and {type(labels)} (labels)")
diff --git a/colossalai/legacy/inference/dynamic_batching/sampling_params.py b/colossalai/legacy/inference/dynamic_batching/sampling_params.py
index a37a83390021..4c7f4e7f0a99 100644
--- a/colossalai/legacy/inference/dynamic_batching/sampling_params.py
+++ b/colossalai/legacy/inference/dynamic_batching/sampling_params.py
@@ -1,6 +1,7 @@
 # Adapted from https://github.com/ModelTC/lightllm
 
 """Sampling parameters for text generation."""
+
 from typing import List, Optional, Union
 
 _SAMPLING_EPS = 1e-5
diff --git a/colossalai/legacy/inference/tensor_parallel/modeling/bloom.py b/colossalai/legacy/inference/tensor_parallel/modeling/bloom.py
index 74fa5f470bf8..1ff9940cf3a2 100644
--- a/colossalai/legacy/inference/tensor_parallel/modeling/bloom.py
+++ b/colossalai/legacy/inference/tensor_parallel/modeling/bloom.py
@@ -448,7 +448,7 @@ def bloom_attention_forward(
         fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
 
         # 3 x [batch_size, seq_length, num_heads, head_dim]
-        (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
+        query_layer, key_layer, value_layer = self._split_heads(fused_qkv)
         batch_size, q_length, H, D_HEAD = query_layer.shape
         k = key_layer.reshape(-1, H, D_HEAD)  # batch_size * q_length, H, D_HEAD, q_lenth == 1
         v = value_layer.reshape(-1, H, D_HEAD)  # batch_size * q_length, H, D_HEAD, q_lenth == 1
diff --git a/colossalai/legacy/inference/tensor_parallel/modeling/chatglm2.py b/colossalai/legacy/inference/tensor_parallel/modeling/chatglm2.py
index b8fe8eb54855..c429c4301f1d 100644
--- a/colossalai/legacy/inference/tensor_parallel/modeling/chatglm2.py
+++ b/colossalai/legacy/inference/tensor_parallel/modeling/chatglm2.py
@@ -399,7 +399,7 @@ def chatglm_flash_attn_kvcache_forward(
         # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
         mixed_x_layer = self.query_key_value(hidden_states)
         if self.multi_query_attention:
-            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
+            query_layer, key_layer, value_layer = mixed_x_layer.split(
                 [
                     self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
                     self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
@@ -436,7 +436,7 @@ def chatglm_flash_attn_kvcache_forward(
             )
             mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
             # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
-            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
+            query_layer, key_layer, value_layer = split_tensor_along_last_dim(mixed_x_layer, 3)
         cos, sin = infer_state.position_cos, infer_state.position_sin
 
         chatglm2_rotary_emb_fwd(
diff --git a/colossalai/legacy/moe/openmoe/model/modeling_openmoe.py b/colossalai/legacy/moe/openmoe/model/modeling_openmoe.py
index 5d6e91765883..b3075a3e450d 100644
--- a/colossalai/legacy/moe/openmoe/model/modeling_openmoe.py
+++ b/colossalai/legacy/moe/openmoe/model/modeling_openmoe.py
@@ -17,7 +17,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch OpenMoE model."""
+"""PyTorch OpenMoE model."""
+
 import math
 from typing import List, Optional, Tuple, Union
 
diff --git a/colossalai/legacy/nn/layer/parallel_sequence/layers.py b/colossalai/legacy/nn/layer/parallel_sequence/layers.py
index 445b7e4cda2a..486b77d121f6 100644
--- a/colossalai/legacy/nn/layer/parallel_sequence/layers.py
+++ b/colossalai/legacy/nn/layer/parallel_sequence/layers.py
@@ -122,7 +122,7 @@ def forward(self, hidden_states, attention_mask):
             "the last dimension is not a multiple of 3, " "cannot be divided into query, key and value"
         )
         partition_size = last_dim_value // 3
-        (query_layer, key_layer, value_layer) = torch.split(mixed_x_layer, partition_size, dim=last_dim)
+        query_layer, key_layer, value_layer = torch.split(mixed_x_layer, partition_size, dim=last_dim)
 
         # attention scores: [batch_size, num_heads, sub_seq_len, seq_len]
         output_size = (
diff --git a/colossalai/nn/layer/layernorm.py b/colossalai/nn/layer/layernorm.py
index 1db48faee213..5b258ca3e71d 100644
--- a/colossalai/nn/layer/layernorm.py
+++ b/colossalai/nn/layer/layernorm.py
@@ -1,6 +1,6 @@
 """This code is from NVIDIA apex:
-      https://github.com/NVIDIA/apex
-   with some changes. """
+   https://github.com/NVIDIA/apex
+with some changes."""
 
 import numbers
 
diff --git a/colossalai/nn/optimizer/distributed_galore.py b/colossalai/nn/optimizer/distributed_galore.py
index edd119c7f3a9..8c9745255e86 100644
--- a/colossalai/nn/optimizer/distributed_galore.py
+++ b/colossalai/nn/optimizer/distributed_galore.py
@@ -1,4 +1,4 @@
-""" adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""
+"""adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""
 
 import warnings
 from collections import defaultdict
diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py
index c12551657318..029dda304fa0 100644
--- a/colossalai/nn/optimizer/fused_adam.py
+++ b/colossalai/nn/optimizer/fused_adam.py
@@ -6,6 +6,7 @@
 This file is adapted from fused adam in NVIDIA/apex, commit a109f85
 Licensed under the MIT License.
 """
+
 import torch
 
 from colossalai.utils import get_current_device, multi_tensor_applier
diff --git a/colossalai/nn/optimizer/galore.py b/colossalai/nn/optimizer/galore.py
index 7db97605d47e..c5e191e6a227 100644
--- a/colossalai/nn/optimizer/galore.py
+++ b/colossalai/nn/optimizer/galore.py
@@ -1,4 +1,4 @@
-""" adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""
+"""adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""
 
 import warnings
 from typing import List
diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 71e96c5b0d3e..4e8236e38b2a 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -545,9 +545,7 @@ def attention(
             RingAttention.ATTN_DONE = torch.cuda.Event()
         if RingAttention.SP_STREAM is None:
             RingAttention.SP_STREAM = torch.cuda.Stream()
-        assert (
-            q.shape[2] == k.shape[2]
-        ), "Q, K and V having different sequence lengths (inference or cross-attn)\
+        assert q.shape[2] == k.shape[2], "Q, K and V having different sequence lengths (inference or cross-attn)\
             is not supported yet in training."
         assert (
             attention_mask_type in RingAttention.SUPPORTED_MASK_TYPES
@@ -719,7 +717,7 @@ def forward(
         # Helper to pass args to FA
         def _forward(q, k, v, causal):
             if version.parse(flash_attn.__version__) > version.parse("2.6.3"):
-                (out, softmax_lse, S_dmask, rng_state) = _flash_attn_forward(
+                out, softmax_lse, S_dmask, rng_state = _flash_attn_forward(
                     q,
                     k,
                     v,
@@ -778,7 +776,7 @@ def _local_ring_forward():
                         # Compute with local KV; no mask
                         kv_block = kv_buffers[0]
                         q_block = q
-                        (block_out[i % 2], block_softmax_lse[i % 2], rng_states[i]) = _forward(  # (T, H, D)  # (H, T)
+                        block_out[i % 2], block_softmax_lse[i % 2], rng_states[i] = _forward(  # (T, H, D)  # (H, T)
                             q_block, kv_block[0], kv_block[1], causal=True
                         )
                     elif i <= local_sp_rank:
@@ -945,7 +943,7 @@ def backward(ctx, dout, _):
         over all ranks for accumulation. We avoid using two streams due to backward using doubled
         buffers and more comm cost.
         """
-        (q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_kv, half_idx_front, half_idx_back) = ctx.saved_tensors[:9]
+        q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_kv, half_idx_front, half_idx_back = ctx.saved_tensors[:9]
         rng_states = ctx.saved_tensors[9:]
 
         is_packed = ctx.is_packed
diff --git a/colossalai/shardformer/modeling/chatglm2.py b/colossalai/shardformer/modeling/chatglm2.py
index be13200b5c4f..8ce67dbe9a0b 100644
--- a/colossalai/shardformer/modeling/chatglm2.py
+++ b/colossalai/shardformer/modeling/chatglm2.py
@@ -1,4 +1,4 @@
-""" PyTorch ChatGLM model. """
+"""PyTorch ChatGLM model."""
 
 from typing import List, Optional, Tuple
 
@@ -482,7 +482,7 @@ def forward(
 
         mixed_x_layer = self.query_key_value(hidden_states)
         if self.multi_query_attention:
-            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
+            query_layer, key_layer, value_layer = mixed_x_layer.split(
                 [
                     self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
                     self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
@@ -518,7 +518,7 @@ def forward(
             )
             mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
             # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
-            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
+            query_layer, key_layer, value_layer = split_tensor_along_last_dim(mixed_x_layer, 3)
 
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
diff --git a/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py b/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py
index 6ae4b06e517a..9ced3712cb9e 100644
--- a/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py
+++ b/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py
@@ -444,7 +444,7 @@ def forward(
         # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
         mixed_x_layer = self.query_key_value(hidden_states)
         if self.multi_query_attention:
-            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
+            query_layer, key_layer, value_layer = mixed_x_layer.split(
                 [
                     self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
                     self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
@@ -480,7 +480,7 @@ def forward(
             )
             mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
             # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
-            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
+            query_layer, key_layer, value_layer = split_tensor_along_last_dim(mixed_x_layer, 3)
 
         # apply relative positional encoding (rotary embedding)
         if rotary_pos_emb is not None:
diff --git a/examples/community/roberta/preprocessing/get_mask.py b/examples/community/roberta/preprocessing/get_mask.py
index f0ba8fe38501..0a8991dd85cf 100644
--- a/examples/community/roberta/preprocessing/get_mask.py
+++ b/examples/community/roberta/preprocessing/get_mask.py
@@ -34,8 +34,8 @@ def __init__(
         self.do_whole_word_mask = do_whole_word_mask
         self.max_predictions_per_seq = max_predictions_per_seq
         self.vocab_words = list(tokenizer.vocab.keys())
-        self.rec = re.compile("[\u4E00-\u9FA5]")
-        self.whole_rec = re.compile("##[\u4E00-\u9FA5]")
+        self.rec = re.compile("[\u4e00-\u9fa5]")
+        self.whole_rec = re.compile("##[\u4e00-\u9fa5]")
 
         self.mlm_p = 0.15
         self.mlm_mask_p = 0.8
diff --git a/examples/community/roberta/preprocessing/mask.cpp b/examples/community/roberta/preprocessing/mask.cpp
index d44f58eccfc2..428faa220e82 100644
--- a/examples/community/roberta/preprocessing/mask.cpp
+++ b/examples/community/roberta/preprocessing/mask.cpp
@@ -75,15 +75,15 @@ auto get_new_segment(
   return new_segment;
 }
 
-bool startsWith(const std::string &s, const std::string &sub) {
+bool startsWith(const std::string& s, const std::string& sub) {
   return s.find(sub) == 0 ? true : false;
 }
 
 auto create_whole_masked_lm_predictions(
-    std::vector<std::string> &tokens,
-    const std::vector<std::string> &original_tokens,
-    const std::vector<std::string> &vocab_words,
-    std::map<std::string, int> &vocab, const int max_predictions_per_seq,
+    std::vector<std::string>& tokens,
+    const std::vector<std::string>& original_tokens,
+    const std::vector<std::string>& vocab_words,
+    std::map<std::string, int>& vocab, const int max_predictions_per_seq,
     const double masked_lm_prob) {
   // for (auto item : vocab) {
   //     std::cout << "key=" << std::string(py::str(item.first)) << ", "
diff --git a/examples/community/roberta/pretraining/model/deberta_v2.py b/examples/community/roberta/pretraining/model/deberta_v2.py
index c7457942e164..e3871964c503 100644
--- a/examples/community/roberta/pretraining/model/deberta_v2.py
+++ b/examples/community/roberta/pretraining/model/deberta_v2.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch DeBERTa-v2 model."""
+"""PyTorch DeBERTa-v2 model."""
 
 import math
 from collections.abc import Sequence
diff --git a/examples/images/vit/vit_train_demo.py b/examples/images/vit/vit_train_demo.py
index a65f89171a03..72df27c1c855 100644
--- a/examples/images/vit/vit_train_demo.py
+++ b/examples/images/vit/vit_train_demo.py
@@ -126,11 +126,9 @@ def evaluate_model(
     avg_loss = "{:.4f}".format(accum_loss.item())
     accuracy = "{:.4f}".format(accum_correct.item() / total_num.item())
     if coordinator.is_master():
-        print(
-            f"Evaluation result for epoch {epoch + 1}: \
+        print(f"Evaluation result for epoch {epoch + 1}: \
                 average_loss={avg_loss}, \
-                accuracy={accuracy}."
-        )
+                accuracy={accuracy}.")
 
 
 def main():
diff --git a/examples/tutorial/opt/opt/run_clm.py b/examples/tutorial/opt/opt/run_clm.py
index cb62f77e1add..ca0d49011674 100644
--- a/examples/tutorial/opt/opt/run_clm.py
+++ b/examples/tutorial/opt/opt/run_clm.py
@@ -20,6 +20,7 @@
 Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
 https://huggingface.co/models?filter=text-generation
 """
+
 # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
 
 import math
diff --git a/examples/tutorial/sequence_parallel/data/__init__.py b/examples/tutorial/sequence_parallel/data/__init__.py
index 137f3cf0267b..cbc4e65a690a 100644
--- a/examples/tutorial/sequence_parallel/data/__init__.py
+++ b/examples/tutorial/sequence_parallel/data/__init__.py
@@ -18,7 +18,7 @@ def cyclic_iter(iter):
 def build_train_valid_test_data_iterators(
     train_iters, global_batch_size, eval_interval, eval_iters, dataloader_type="single", **kwargs
 ):
-    (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None)
+    train_dataloader, valid_dataloader, test_dataloader = (None, None, None)
 
     logger = get_dist_logger()
     logger.info("> building train, validation, and test datasets ...", ranks=[0])
diff --git a/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py b/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py
index afab202e0927..929a5577835a 100644
--- a/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py
+++ b/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py
@@ -252,7 +252,7 @@ def build_training_sample(
 
     # Masking.
     max_predictions_per_seq = masked_lm_prob * max_num_tokens
-    (tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
+    tokens, masked_positions, masked_labels, _ = create_masked_lm_predictions(
         tokens,
         vocab_id_list,
         vocab_id_to_token_dict,
diff --git a/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py b/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py
index 8ba598529ebc..b65fd13b9e5a 100644
--- a/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py
+++ b/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """Dataloaders."""
 
-
 import torch
 
 from colossalai.legacy.context import ParallelMode
diff --git a/extensions/csrc/kernel/arm/cpu_adam_arm.cpp b/extensions/csrc/kernel/arm/cpu_adam_arm.cpp
index a715a2711576..5e295c69209a 100644
--- a/extensions/csrc/kernel/arm/cpu_adam_arm.cpp
+++ b/extensions/csrc/kernel/arm/cpu_adam_arm.cpp
@@ -1,7 +1,7 @@
 #include "cpu_adam_arm.h"
 
-void AdamOptimizer::Step_1(void *_params, void *grads, void *_exp_avg,
-                           void *_exp_avg_sq, size_t _param_size,
+void AdamOptimizer::Step_1(void* _params, void* grads, void* _exp_avg,
+                           void* _exp_avg_sq, size_t _param_size,
                            at::ScalarType param_dtype,
                            at::ScalarType grad_dtype,
                            at::ScalarType exp_avg_dtype,
@@ -106,8 +106,8 @@ void AdamOptimizer::Step_1(void *_params, void *grads, void *_exp_avg,
   }
 }
 
-void AdamOptimizer::Step_4(void *_params, void *grads, void *_exp_avg,
-                           void *_exp_avg_sq, size_t _param_size,
+void AdamOptimizer::Step_4(void* _params, void* grads, void* _exp_avg,
+                           void* _exp_avg_sq, size_t _param_size,
                            at::ScalarType param_dtype,
                            at::ScalarType grad_dtype,
                            at::ScalarType exp_avg_dtype,
@@ -192,8 +192,8 @@ void AdamOptimizer::Step_4(void *_params, void *grads, void *_exp_avg,
   }
 }
 
-void AdamOptimizer::Step_8(void *_params, void *grads, void *_exp_avg,
-                           void *_exp_avg_sq, size_t _param_size,
+void AdamOptimizer::Step_8(void* _params, void* grads, void* _exp_avg,
+                           void* _exp_avg_sq, size_t _param_size,
                            at::ScalarType param_dtype,
                            at::ScalarType grad_dtype,
                            at::ScalarType exp_avg_dtype,
@@ -279,9 +279,9 @@ void AdamOptimizer::Step_8(void *_params, void *grads, void *_exp_avg,
 
 void AdamOptimizer::step(size_t step, float lr, float beta1, float beta2,
                          float epsilon, float weight_decay,
-                         bool bias_correction, torch::Tensor &params,
-                         torch::Tensor &grads, torch::Tensor &exp_avg,
-                         torch::Tensor &exp_avg_sq, float loss_scale) {
+                         bool bias_correction, torch::Tensor& params,
+                         torch::Tensor& grads, torch::Tensor& exp_avg,
+                         torch::Tensor& exp_avg_sq, float loss_scale) {
   auto params_c = params.contiguous();
   auto grads_c = grads.contiguous();
   auto exp_avg_c = exp_avg.contiguous();
diff --git a/extensions/csrc/kernel/arm/cpu_adam_arm.h b/extensions/csrc/kernel/arm/cpu_adam_arm.h
index d48968e21682..70233be18a42 100644
--- a/extensions/csrc/kernel/arm/cpu_adam_arm.h
+++ b/extensions/csrc/kernel/arm/cpu_adam_arm.h
@@ -11,15 +11,15 @@
 #include <arm_neon.h>
 #define SIMD_WIDTH 4
 
-inline float32x4_t simd_load_offset(const void *ptr, at::ScalarType dtype,
+inline float32x4_t simd_load_offset(const void* ptr, at::ScalarType dtype,
                                     size_t offset) {
   switch (dtype) {
     case at::ScalarType::Float: {
-      auto ptr_f = reinterpret_cast<const float32_t *>(ptr);
+      auto ptr_f = reinterpret_cast<const float32_t*>(ptr);
       return vld1q_f32(ptr_f + offset);
     }
     case at::ScalarType::Half: {
-      auto ptr_h = reinterpret_cast<const float16_t *>(ptr);
+      auto ptr_h = reinterpret_cast<const float16_t*>(ptr);
       return vcvt_f32_f16(vld1_f16(ptr_h + offset));
     }
     // case at::ScalarType::BFloat16: {
@@ -31,20 +31,20 @@ inline float32x4_t simd_load_offset(const void *ptr, at::ScalarType dtype,
       break;
   }
 }
-inline float32x4_t simd_load(void const *ptr, at::ScalarType dtype) {
+inline float32x4_t simd_load(void const* ptr, at::ScalarType dtype) {
   return simd_load_offset(ptr, dtype, 0);
 }
 
-inline void simd_store_offset(void *ptr, at::ScalarType dtype, float32x4_t data,
+inline void simd_store_offset(void* ptr, at::ScalarType dtype, float32x4_t data,
                               size_t offset) {
   switch (dtype) {
     case at::ScalarType::Float: {
-      auto ptr_f = reinterpret_cast<float32_t *>(ptr);
+      auto ptr_f = reinterpret_cast<float32_t*>(ptr);
       vst1q_f32(ptr_f + offset, data);
       break;
     }
     case at::ScalarType::Half: {
-      auto ptr_h = reinterpret_cast<float16_t *>(ptr);
+      auto ptr_h = reinterpret_cast<float16_t*>(ptr);
       vst1_f16(ptr_h + offset, vcvt_f16_f32(data));
       break;
     }
@@ -59,7 +59,7 @@ inline void simd_store_offset(void *ptr, at::ScalarType dtype, float32x4_t data,
   }
 }
 
-inline void simd_store(void *ptr, at::ScalarType dtype, float32x4_t data) {
+inline void simd_store(void* ptr, at::ScalarType dtype, float32x4_t data) {
   return simd_store_offset(ptr, dtype, data, 0);
 }
 
@@ -70,14 +70,14 @@ inline float32x4_t simd_set(float value) {
 
 #endif
 
-inline float scalar_load_offset(const void *ptr, at::ScalarType dtype,
+inline float scalar_load_offset(const void* ptr, at::ScalarType dtype,
                                 size_t offset) {
   switch (dtype) {
     case at::ScalarType::Float:
-      return *(reinterpret_cast<const float *>(ptr) + offset);
+      return *(reinterpret_cast<const float*>(ptr) + offset);
     case at::ScalarType::Half:
       return static_cast<float>(
-          *(reinterpret_cast<const at::Half *>(ptr) + offset));
+          *(reinterpret_cast<const at::Half*>(ptr) + offset));
     // case at::ScalarType::BFloat16:
     //   return static_cast<float>(
     //       *(reinterpret_cast<const at::BFloat16 *>(ptr) + offset));
@@ -87,14 +87,14 @@ inline float scalar_load_offset(const void *ptr, at::ScalarType dtype,
   }
 }
 
-inline void scalar_store_offset(void *ptr, at::ScalarType dtype, float data,
+inline void scalar_store_offset(void* ptr, at::ScalarType dtype, float data,
                                 size_t offset) {
   switch (dtype) {
     case at::ScalarType::Float:
-      *(reinterpret_cast<float *>(ptr) + offset) = data;
+      *(reinterpret_cast<float*>(ptr) + offset) = data;
       break;
     case at::ScalarType::Half:
-      *(reinterpret_cast<at::Half *>(ptr) + offset) = data;
+      *(reinterpret_cast<at::Half*>(ptr) + offset) = data;
       break;
       // case at::ScalarType::BFloat16:
       //   *(reinterpret_cast<at::BFloat16 *>(ptr) + offset) = data;
@@ -105,13 +105,13 @@ inline void scalar_store_offset(void *ptr, at::ScalarType dtype, float data,
   }
 }
 
-inline void *scalar_seek_offset(void *ptr, at::ScalarType dtype,
+inline void* scalar_seek_offset(void* ptr, at::ScalarType dtype,
                                 size_t offset) {
   switch (dtype) {
     case at::ScalarType::Float:
-      return reinterpret_cast<float *>(ptr) + offset;
+      return reinterpret_cast<float*>(ptr) + offset;
     case at::ScalarType::Half:
-      return reinterpret_cast<at::Half *>(ptr) + offset;
+      return reinterpret_cast<at::Half*>(ptr) + offset;
     // case at::ScalarType::BFloat16:
     //   return reinterpret_cast<at::BFloat16 *>(ptr) + offset;
     default:
@@ -120,8 +120,8 @@ inline void *scalar_seek_offset(void *ptr, at::ScalarType dtype,
   }
 }
 #define STEP(SPAN)                                                        \
-  void Step_##SPAN(void *_params, void *grads, void *_exp_avg,            \
-                   void *_exp_avg_sq, size_t _param_size,                 \
+  void Step_##SPAN(void* _params, void* grads, void* _exp_avg,            \
+                   void* _exp_avg_sq, size_t _param_size,                 \
                    at::ScalarType param_dtype, at::ScalarType grad_dtype, \
                    at::ScalarType exp_avg_dtype,                          \
                    at::ScalarType exp_avg_sq_dtype, float loss_scale = -1);
@@ -195,7 +195,7 @@ class AdamOptimizer {
   }
 
   void step(size_t step, float lr, float beta1, float beta2, float epsilon,
-            float weight_decay, bool bias_correction, torch::Tensor &params,
-            torch::Tensor &grads, torch::Tensor &exp_avg,
-            torch::Tensor &exp_avg_sq, float loss_scale);
+            float weight_decay, bool bias_correction, torch::Tensor& params,
+            torch::Tensor& grads, torch::Tensor& exp_avg,
+            torch::Tensor& exp_avg_sq, float loss_scale);
 };
diff --git a/extensions/csrc/kernel/cuda/utils/vec_copy.h b/extensions/csrc/kernel/cuda/utils/vec_copy.h
index 465703a743a8..10423be6b359 100644
--- a/extensions/csrc/kernel/cuda/utils/vec_copy.h
+++ b/extensions/csrc/kernel/cuda/utils/vec_copy.h
@@ -9,36 +9,36 @@ namespace cuda {
 namespace utils {
 
 template <typename T, int VecSize>
-__device__ __inline__ void copy_zero(T *dst) {
+__device__ __inline__ void copy_zero(T* dst) {
   using VT = typename common::VecTypeTrait<T, VecSize>::Type;
-  *(reinterpret_cast<VT *>(dst)) = funcs::CastFunctor<float, VT>()(0.0f);
+  *(reinterpret_cast<VT*>(dst)) = funcs::CastFunctor<float, VT>()(0.0f);
 }
 
 template <typename SrcT, typename DstT, int VecSize>
-__device__ __inline__ void copy(const SrcT *src, DstT *dst) {
+__device__ __inline__ void copy(const SrcT* src, DstT* dst) {
   using SrcVT = typename common::VecTypeTrait<SrcT, VecSize>::Type;
   using DstVT = typename common::VecTypeTrait<DstT, VecSize>::Type;
-  *(reinterpret_cast<DstVT *>(dst)) = funcs::CastFunctor<SrcVT, DstVT>()(
-      *(reinterpret_cast<const SrcVT *>(src)));
+  *(reinterpret_cast<DstVT*>(dst)) = funcs::CastFunctor<SrcVT, DstVT>()(
+      *(reinterpret_cast<const SrcVT*>(src)));
 }
 
 template <typename T, int VecSize>
-__device__ __inline__ void copy(const T *src, T *dst) {
+__device__ __inline__ void copy(const T* src, T* dst) {
   using VT = typename common::VecTypeTrait<T, VecSize>::Type;
-  *(reinterpret_cast<VT *>(dst)) = *(reinterpret_cast<const VT *>(src));
+  *(reinterpret_cast<VT*>(dst)) = *(reinterpret_cast<const VT*>(src));
 }
 
 template <>
-__device__ __inline__ void copy<float, float, 8>(const float *src, float *dst) {
+__device__ __inline__ void copy<float, float, 8>(const float* src, float* dst) {
   // Since the maximum memory alignment length is 128 bits, we choose float4
   // here.
-  *(reinterpret_cast<float4 *>(dst)) = *(reinterpret_cast<const float4 *>(src));
-  *(reinterpret_cast<float4 *>(dst + 4)) =
-      *(reinterpret_cast<const float4 *>(src + 4));
+  *(reinterpret_cast<float4*>(dst)) = *(reinterpret_cast<const float4*>(src));
+  *(reinterpret_cast<float4*>(dst + 4)) =
+      *(reinterpret_cast<const float4*>(src + 4));
 }
 
 template <typename T>
-int get_vec_size(const torch::Tensor &tensor) {
+int get_vec_size(const torch::Tensor& tensor) {
   uint64_t address = reinterpret_cast<uint64_t>(tensor.data_ptr());
   const int max_aligned_size = 128;
   const int dtype_size = sizeof(T) * 8;
diff --git a/extensions/csrc/kernel/x86/cpu_adam.cpp b/extensions/csrc/kernel/x86/cpu_adam.cpp
index be9300c545c2..ebb178533d72 100644
--- a/extensions/csrc/kernel/x86/cpu_adam.cpp
+++ b/extensions/csrc/kernel/x86/cpu_adam.cpp
@@ -32,8 +32,8 @@ SOFTWARE
 
 // C++ interface
 
-void Adam_Optimizer::Step_1(float *_params, float *grads, float *_exp_avg,
-                            float *_exp_avg_sq, size_t _param_size,
+void Adam_Optimizer::Step_1(float* _params, float* grads, float* _exp_avg,
+                            float* _exp_avg_sq, size_t _param_size,
                             bool param_half_precision, bool grad_half_precision,
                             bool momentum_half_precision,
                             bool variance_half_precision, float loss_scale) {
@@ -44,10 +44,10 @@ void Adam_Optimizer::Step_1(float *_params, float *grads, float *_exp_avg,
   float step_size = -1 * _alpha / _bias_correction1;
   float w_decay = -1 * _alpha * _weight_decay;
 
-  __half *params_cast_h = reinterpret_cast<__half *>(_params);
-  __half *grads_cast_h = reinterpret_cast<__half *>(grads);
-  __half *momentum_cast_h = reinterpret_cast<__half *>(_exp_avg);
-  __half *variance_cast_h = reinterpret_cast<__half *>(_exp_avg_sq);
+  __half* params_cast_h = reinterpret_cast<__half*>(_params);
+  __half* grads_cast_h = reinterpret_cast<__half*>(grads);
+  __half* momentum_cast_h = reinterpret_cast<__half*>(_exp_avg);
+  __half* variance_cast_h = reinterpret_cast<__half*>(_exp_avg_sq);
 
 #if defined(__AVX512__) or defined(__AVX256__) or defined(__AVX2__)
   AVX_Data betta1_4;
@@ -182,17 +182,17 @@ void Adam_Optimizer::Step_1(float *_params, float *grads, float *_exp_avg,
   }
 }
 
-void Adam_Optimizer::Step_4(float *_params, float *grads, float *_exp_avg,
-                            float *_exp_avg_sq, size_t _param_size,
+void Adam_Optimizer::Step_4(float* _params, float* grads, float* _exp_avg,
+                            float* _exp_avg_sq, size_t _param_size,
                             bool param_half_precision, bool grad_half_precision,
                             bool momentum_half_precision,
                             bool variance_half_precision, float loss_scale) {
   size_t rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * 4);
 
-  __half *params_cast_h = reinterpret_cast<__half *>(_params);
-  __half *grads_cast_h = reinterpret_cast<__half *>(grads);
-  __half *momentum_cast_h = reinterpret_cast<__half *>(_exp_avg);
-  __half *variance_cast_h = reinterpret_cast<__half *>(_exp_avg_sq);
+  __half* params_cast_h = reinterpret_cast<__half*>(_params);
+  __half* grads_cast_h = reinterpret_cast<__half*>(grads);
+  __half* momentum_cast_h = reinterpret_cast<__half*>(_exp_avg);
+  __half* variance_cast_h = reinterpret_cast<__half*>(_exp_avg_sq);
 
 #if defined(__AVX512__) or defined(__AVX256__) or defined(__AVX2__)
   AVX_Data betta1_4;
@@ -285,29 +285,29 @@ void Adam_Optimizer::Step_4(float *_params, float *grads, float *_exp_avg,
   }
 #endif
   if (_param_size > rounded_size)
-    Step_1((param_half_precision ? (float *)(params_cast_h + rounded_size)
+    Step_1((param_half_precision ? (float*)(params_cast_h + rounded_size)
                                  : _params + rounded_size),
-           (grad_half_precision ? (float *)(grads_cast_h + rounded_size)
+           (grad_half_precision ? (float*)(grads_cast_h + rounded_size)
                                 : grads + rounded_size),
-           (momentum_half_precision ? (float *)(momentum_cast_h + rounded_size)
+           (momentum_half_precision ? (float*)(momentum_cast_h + rounded_size)
                                     : _exp_avg + rounded_size),
-           (variance_half_precision ? (float *)(variance_cast_h + rounded_size)
+           (variance_half_precision ? (float*)(variance_cast_h + rounded_size)
                                     : _exp_avg_sq + rounded_size),
            (_param_size - rounded_size), param_half_precision,
            grad_half_precision, momentum_half_precision,
            variance_half_precision, loss_scale);
 }
 
-void Adam_Optimizer::Step_8(float *_params, float *grads, float *_exp_avg,
-                            float *_exp_avg_sq, size_t _param_size,
+void Adam_Optimizer::Step_8(float* _params, float* grads, float* _exp_avg,
+                            float* _exp_avg_sq, size_t _param_size,
                             bool param_half_precision, bool grad_half_precision,
                             bool momentum_half_precision,
                             bool variance_half_precision, float loss_scale) {
   size_t rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * 8);
-  __half *params_cast_h = reinterpret_cast<__half *>(_params);
-  __half *grads_cast_h = reinterpret_cast<__half *>(grads);
-  __half *momentum_cast_h = reinterpret_cast<__half *>(_exp_avg);
-  __half *variance_cast_h = reinterpret_cast<__half *>(_exp_avg_sq);
+  __half* params_cast_h = reinterpret_cast<__half*>(_params);
+  __half* grads_cast_h = reinterpret_cast<__half*>(grads);
+  __half* momentum_cast_h = reinterpret_cast<__half*>(_exp_avg);
+  __half* variance_cast_h = reinterpret_cast<__half*>(_exp_avg_sq);
 
 #if defined(__AVX512__) or defined(__AVX256__) or defined(__AVX2__)
   AVX_Data betta1_4;
@@ -400,13 +400,13 @@ void Adam_Optimizer::Step_8(float *_params, float *grads, float *_exp_avg,
   }
 #endif
   if (_param_size > rounded_size)
-    Step_4((param_half_precision ? (float *)(params_cast_h + rounded_size)
+    Step_4((param_half_precision ? (float*)(params_cast_h + rounded_size)
                                  : _params + rounded_size),
-           (grad_half_precision ? (float *)(grads_cast_h + rounded_size)
+           (grad_half_precision ? (float*)(grads_cast_h + rounded_size)
                                 : grads + rounded_size),
-           (momentum_half_precision ? (float *)(momentum_cast_h + rounded_size)
+           (momentum_half_precision ? (float*)(momentum_cast_h + rounded_size)
                                     : _exp_avg + rounded_size),
-           (variance_half_precision ? (float *)(variance_cast_h + rounded_size)
+           (variance_half_precision ? (float*)(variance_cast_h + rounded_size)
                                     : _exp_avg_sq + rounded_size),
            (_param_size - rounded_size), param_half_precision,
            grad_half_precision, momentum_half_precision,
@@ -415,18 +415,18 @@ void Adam_Optimizer::Step_8(float *_params, float *grads, float *_exp_avg,
 
 void Adam_Optimizer::step(size_t step, float lr, float beta1, float beta2,
                           float epsilon, float weight_decay,
-                          bool bias_correction, torch::Tensor &params,
-                          torch::Tensor &grads, torch::Tensor &exp_avg,
-                          torch::Tensor &exp_avg_sq, float loss_scale) {
+                          bool bias_correction, torch::Tensor& params,
+                          torch::Tensor& grads, torch::Tensor& exp_avg,
+                          torch::Tensor& exp_avg_sq, float loss_scale) {
   auto params_c = params.contiguous();
   auto grads_c = grads.contiguous();
   auto exp_avg_c = exp_avg.contiguous();
   auto exp_avg_sq_c = exp_avg_sq.contiguous();
 
-  float *params_ptr = (float *)params_c.data_ptr();
-  float *grads_ptr = (float *)grads_c.data_ptr();
-  float *exp_avg_ptr = (float *)exp_avg_c.data_ptr();
-  float *exp_avg_sq_ptr = (float *)exp_avg_sq_c.data_ptr();
+  float* params_ptr = (float*)params_c.data_ptr();
+  float* grads_ptr = (float*)grads_c.data_ptr();
+  float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
+  float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
 
   this->IncrementStep(step, beta1, beta2);
   this->update_state(lr, epsilon, weight_decay, bias_correction);
diff --git a/extensions/csrc/kernel/x86/cpu_adam.h b/extensions/csrc/kernel/x86/cpu_adam.h
index 45e1dde6242d..2e32066ee4f7 100644
--- a/extensions/csrc/kernel/x86/cpu_adam.h
+++ b/extensions/csrc/kernel/x86/cpu_adam.h
@@ -49,10 +49,10 @@ SOFTWARE
 #define SIMD_SQRT(x) _mm512_sqrt_ps(x)
 #define SIMD_DIV(x, y) _mm512_div_ps(x, y)
 #define SIMD_LOAD_HALF(x) \
-  _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
-#define SIMD_STORE_HALF(x, d)                                         \
-  _mm256_storeu_ps((float *)(x), _mm256_castsi256_ps(_mm512_cvtps_ph( \
-                                     d, _MM_FROUND_TO_NEAREST_INT)))
+  _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(x)))
+#define SIMD_STORE_HALF(x, d)                                        \
+  _mm256_storeu_ps((float*)(x), _mm256_castsi256_ps(_mm512_cvtps_ph( \
+                                    d, _MM_FROUND_TO_NEAREST_INT)))
 
 #elif defined(__AVX256__) or defined(__AVX2__)
 #define SIMD_WIDTH 8
@@ -65,10 +65,10 @@ SOFTWARE
 #define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
 #define SIMD_SQRT(x) _mm256_sqrt_ps(x)
 #define SIMD_DIV(x, y) _mm256_div_ps(x, y)
-#define SIMD_LOAD_HALF(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
-#define SIMD_STORE_HALF(x, d)                                   \
-  _mm_storeu_ps((float *)(x), _mm_castsi128_ps(_mm256_cvtps_ph( \
-                                  d, _MM_FROUND_TO_NEAREST_INT)))
+#define SIMD_LOAD_HALF(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)(x)))
+#define SIMD_STORE_HALF(x, d)                                  \
+  _mm_storeu_ps((float*)(x), _mm_castsi128_ps(_mm256_cvtps_ph( \
+                                 d, _MM_FROUND_TO_NEAREST_INT)))
 
 #endif
 
@@ -85,7 +85,7 @@ union AVX_Data {
 
 #define STEP(SPAN)                                                            \
   void Step_##SPAN(                                                           \
-      float *_params, float *grads, float *_exp_avg, float *_exp_avg_sq,      \
+      float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,      \
       size_t _param_size, bool param_half_precision = false,                  \
       bool grad_half_precision = false, bool momentum_half_precision = false, \
       bool variance_half_precision = false, float loss_scale = -1);
@@ -143,8 +143,8 @@ class Adam_Optimizer {
   }
 
 #if defined(__AVX512__) or defined(__AVX256__) or defined(__AVX2__)
-  inline void simd_load(bool is_half, float *ptr, __half *h_ptr,
-                        AVX_Data &data) {
+  inline void simd_load(bool is_half, float* ptr, __half* h_ptr,
+                        AVX_Data& data) {
     if (is_half) {
       data.data = SIMD_LOAD_HALF(h_ptr);
     } else {
@@ -152,8 +152,8 @@ class Adam_Optimizer {
     }
   }
 
-  inline void simd_store(bool is_half, float *ptr, __half *h_ptr,
-                         AVX_Data &data) {
+  inline void simd_store(bool is_half, float* ptr, __half* h_ptr,
+                         AVX_Data& data) {
     if (is_half) {
       SIMD_STORE_HALF(h_ptr, data.data);
     } else {
@@ -163,9 +163,9 @@ class Adam_Optimizer {
 #endif
 
   void step(size_t step, float lr, float beta1, float beta2, float epsilon,
-            float weight_decay, bool bias_correction, torch::Tensor &params,
-            torch::Tensor &grads, torch::Tensor &exp_avg,
-            torch::Tensor &exp_avg_sq, float loss_scale);
+            float weight_decay, bool bias_correction, torch::Tensor& params,
+            torch::Tensor& grads, torch::Tensor& exp_avg,
+            torch::Tensor& exp_avg_sq, float loss_scale);
 
  private:
   float _alpha;
diff --git a/extensions/pybind/layernorm/layer_norm.cpp b/extensions/pybind/layernorm/layer_norm.cpp
index 77c4e38c8150..550f95d158e0 100644
--- a/extensions/pybind/layernorm/layer_norm.cpp
+++ b/extensions/pybind/layernorm/layer_norm.cpp
@@ -11,8 +11,8 @@
 
 namespace {
 
-void compute_n1_n2(at::Tensor input, at::IntArrayRef normalized_shape, int &n1,
-                   int &n2) {
+void compute_n1_n2(at::Tensor input, at::IntArrayRef normalized_shape, int& n1,
+                   int& n2) {
   int idiff = input.ndimension() - normalized_shape.size();
   n2 = 1;
   for (int i = 0; i < (int)normalized_shape.size(); ++i) {
@@ -31,8 +31,8 @@ void check_args(at::IntArrayRef normalized_shape, at::Tensor gamma,
   TORCH_CHECK(!beta.defined() || beta.sizes().equals(normalized_shape));
 }
 
-void check_args(at::Tensor input, at::IntArrayRef normalized_shape, int &n1,
-                int &n2) {
+void check_args(at::Tensor input, at::IntArrayRef normalized_shape, int& n1,
+                int& n2) {
   int64_t normalized_ndim = normalized_shape.size();
 
   if (normalized_ndim < 1) {
@@ -63,16 +63,16 @@ void check_args(at::Tensor input, at::IntArrayRef normalized_shape, int &n1,
 }
 
 void check_args(at::Tensor input, at::IntArrayRef normalized_shape,
-                at::Tensor gamma, at::Tensor beta, int &n1, int &n2) {
+                at::Tensor gamma, at::Tensor beta, int& n1, int& n2) {
   check_args(input, normalized_shape, n1, n2);
   check_args(normalized_shape, gamma, beta);
 }
 }  // namespace
 
-void cuda_layer_norm(at::Tensor *output, at::Tensor *mean, at::Tensor *invvar,
-                     at::Tensor *input, int n1, int n2,
-                     at::IntArrayRef normalized_shape, at::Tensor *gamma,
-                     at::Tensor *beta, double epsilon);
+void cuda_layer_norm(at::Tensor* output, at::Tensor* mean, at::Tensor* invvar,
+                     at::Tensor* input, int n1, int n2,
+                     at::IntArrayRef normalized_shape, at::Tensor* gamma,
+                     at::Tensor* beta, double epsilon);
 
 #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
 #define CHECK_CONTIGUOUS(x) \
@@ -103,12 +103,12 @@ std::vector<at::Tensor> layer_norm_affine(at::Tensor input,
   return {output, mean, invvar};
 }
 
-void cuda_layer_norm_gradient(at::Tensor *dout, at::Tensor *mean,
-                              at::Tensor *invvar, at::Tensor *input, int n1,
+void cuda_layer_norm_gradient(at::Tensor* dout, at::Tensor* mean,
+                              at::Tensor* invvar, at::Tensor* input, int n1,
                               int n2, at::IntArrayRef normalized_shape,
-                              at::Tensor *gamma, at::Tensor *beta,
-                              double epsilon, at::Tensor *grad_input,
-                              at::Tensor *grad_gamma, at::Tensor *grad_beta);
+                              at::Tensor* gamma, at::Tensor* beta,
+                              double epsilon, at::Tensor* grad_input,
+                              at::Tensor* grad_gamma, at::Tensor* grad_beta);
 
 std::vector<at::Tensor> layer_norm_gradient_affine(
     at::Tensor dout, at::Tensor mean, at::Tensor invvar, at::Tensor input,
diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
index 53dd3c8dd3ba..f5750cf7be0c 100644
--- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
@@ -41,7 +41,7 @@ def exam_state_dict_with_origin(
 ):
     from transformers import BertForSequenceClassification
 
-    (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
+    model_fn, data_gen_fn, output_transform_fn, _, _ = next(iter(model_zoo.get_sub_registry(model_name).values()))
     bert_model = model_fn()
 
     enable_flash_attention = True if tp_size > 1 else False
@@ -101,7 +101,7 @@ def exam_state_dict(
     use_async: bool,
     low_cpu_mem_mode: bool,
 ):
-    (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
+    model_fn, data_gen_fn, output_transform_fn, _, _ = next(iter(model_zoo.get_sub_registry(model_name).values()))
     criterion = lambda x: x.mean()
     enable_flash_attention = True if tp_size > 1 else False
     enable_fused_normalization = True if tp_size > 1 else False
diff --git a/tests/test_checkpoint_io/test_gemini_torch_compability.py b/tests/test_checkpoint_io/test_gemini_torch_compability.py
index ce4d10322ba5..4cc5e258c85b 100644
--- a/tests/test_checkpoint_io/test_gemini_torch_compability.py
+++ b/tests/test_checkpoint_io/test_gemini_torch_compability.py
@@ -22,7 +22,7 @@
 @parameterize("shard", [False, True])
 @parameterize("model_name", ["transformers_llama_for_causal_lm"])
 def exam_torch_load_from_gemini(shard: bool, model_name: str):
-    (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
+    model_fn, data_gen_fn, output_transform_fn, _, _ = next(iter(model_zoo.get_sub_registry(model_name).values()))
     criterion = lambda x: x.mean()
     plugin = GeminiPlugin(precision="fp16", initial_scale=(2**14))
     booster = Booster(plugin=plugin)
@@ -88,7 +88,7 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str):
 @parameterize("shard", [False, True])
 @parameterize("model_name", ["transformers_gpt"])
 def exam_gemini_load_from_torch(shard: bool, model_name: str):
-    (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
+    model_fn, data_gen_fn, output_transform_fn, _, _ = next(iter(model_zoo.get_sub_registry(model_name).values()))
     criterion = lambda x: x.mean()
     plugin = TorchDDPPlugin()
     booster = Booster(plugin=plugin)
diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
index a338d98f4746..e78e0f6457a9 100644
--- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
@@ -48,9 +48,7 @@
 def exam_state_dict(
     shard: bool, model_name: str, size_per_shard: int, test_config: dict, use_async: bool, low_cpu_mem_mode: bool
 ):
-    (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
-        iter(model_zoo.get_sub_registry(model_name).values())
-    )
+    model_fn, data_gen_fn, output_transform_fn, loss_fn, _ = next(iter(model_zoo.get_sub_registry(model_name).values()))
     criterion = loss_fn
     plugin = HybridParallelPlugin(**test_config)
     booster = Booster(plugin=plugin)
diff --git a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
index 6f8eb2ad26cd..4fdd2a42d97e 100644
--- a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
+++ b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
@@ -21,9 +21,7 @@
 @parameterize("model_name", ["transformers_llama_for_causal_lm"])
 @parameterize("plugin_type", ["ddp", "zero", "gemini"])
 def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32):
-    (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
-        iter(model_zoo.get_sub_registry(model_name).values())
-    )
+    model_fn, data_gen_fn, output_transform_fn, loss_fn, _ = next(iter(model_zoo.get_sub_registry(model_name).values()))
     criterion = loss_fn
 
     if plugin_type == "ddp":
diff --git a/tests/test_tensor/test_mix_gather.py b/tests/test_tensor/test_mix_gather.py
index 6dbbe5de6ff1..100405bef6b4 100644
--- a/tests/test_tensor/test_mix_gather.py
+++ b/tests/test_tensor/test_mix_gather.py
@@ -13,7 +13,7 @@
 
 def check_mix_gather_S0S1(device_mesh, rank):
     tensor_to_check = torch.arange(64).reshape((8, 8)).cuda()
-    (f, b) = (0, 1)
+    f, b = (0, 1)
     f_target_pair = (f, [0])
     b_target_pair = (b, [1])
     gather_dim, logical_process_axes = mix_gather_simulator(f_target_pair, b_target_pair)
@@ -89,7 +89,7 @@ def check_two_all_gather_S0S1(device_mesh, rank):
 
 def check_mix_gather_S1S0(device_mesh, rank):
     tensor_to_check = torch.arange(64).reshape((8, 8)).cuda()
-    (f, b) = (0, 1)
+    f, b = (0, 1)
     f_target_pair = (f, [1])
     b_target_pair = (b, [0])
     gather_dim, logical_process_axes = mix_gather_simulator(f_target_pair, b_target_pair)
@@ -165,7 +165,7 @@ def check_two_all_gather_S1S0(device_mesh, rank):
 
 def check_mix_gather_S01R(device_mesh, rank):
     tensor_to_check = torch.arange(64).reshape((8, 8)).cuda()
-    (f, b) = (0, 1)
+    f, b = (0, 1)
     f_target_pair = (f, [0, 1])
     b_target_pair = (b, [])
     gather_dim, logical_process_axes = mix_gather_simulator(f_target_pair, b_target_pair)
@@ -231,7 +231,7 @@ def check_two_all_gather_S01R(device_mesh, rank):
 def check_mix_gather_RS01(device_mesh, rank):
     tensor_to_check = torch.arange(64).reshape((8, 8)).cuda()
 
-    (f, b) = (0, 1)
+    f, b = (0, 1)
     f_target_pair = (f, [])
     b_target_pair = (b, [0, 1])
     gather_dim, logical_process_axes = mix_gather_simulator(f_target_pair, b_target_pair)