Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
repos:

- repo: https://github.com/PyCQA/autoflake
rev: v2.3.1
rev: v2.3.3
hooks:
- id: autoflake
name: autoflake (python)
args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports']

- repo: https://github.com/pycqa/isort
rev: 5.13.2
rev: 8.0.1
hooks:
- id: isort
name: sort all imports (python)
args: ["--profile", "black"] # avoid conflict with black

- repo: https://github.com/psf/black-pre-commit-mirror
rev: 24.10.0
rev: 26.3.1
hooks:
- id: black
name: black formatter
args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']

- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.5
rev: v22.1.2
hooks:
- id: clang-format
name: clang formatter
types_or: [c++, c]

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v6.0.0
hooks:
- id: check-yaml
- id: check-merge-conflict
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""
Initialize new model with updated tokenizer by calculating the mean values from original model
"""

import argparse

import numpy as np
Expand Down
28 changes: 10 additions & 18 deletions applications/ColossalChat/coati/dataset/tokenization_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,8 @@ def tokenize_sft(
template.messages = []
for idx, mess in enumerate(messages):
if mess["from"] != template.roles[idx % 2]:
raise ValueError(
f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{messages}"
)
raise ValueError(f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{messages}")
template.append_message(mess["from"], mess["content"])

if len(template.messages) % 2 != 0:
Expand Down Expand Up @@ -245,10 +243,8 @@ def tokenize_rlhf(

for idx, mess in enumerate(context):
if mess["from"] != template.roles[idx % 2]:
raise ValueError(
f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{context}"
)
raise ValueError(f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{context}")
template.append_message(mess["from"], mess["content"])

if len(template.messages) % 2 != 1:
Expand All @@ -272,18 +268,14 @@ def tokenize_rlhf(
rejected_continuation = data_point["rejected"]
for round in range(len(chosen_continuation)):
if chosen_continuation[round]["from"] != template.roles[(round + 1) % 2]:
raise ValueError(
f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{chosen_continuation}"
)
raise ValueError(f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{chosen_continuation}")
chosen.append_message(chosen_continuation[round]["from"], chosen_continuation[round]["content"])

for round in range(len(rejected_continuation)):
if rejected_continuation[round]["from"] != template.roles[(round + 1) % 2]:
raise ValueError(
f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{rejected_continuation}"
)
raise ValueError(f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{rejected_continuation}")
rejected.append_message(rejected_continuation[round]["from"], rejected_continuation[round]["content"])

(
Expand All @@ -296,14 +288,14 @@ def tokenize_rlhf(
) = (None, None, None, None, None, None)

chosen_data_packed = apply_rlhf_data_format(chosen, tokenizer)
(chosen_input_ids, chosen_loss_mask, chosen_label_decode) = (
chosen_input_ids, chosen_loss_mask, chosen_label_decode = (
chosen_data_packed["input_ids"],
chosen_data_packed["loss_mask"],
chosen_data_packed["label_decode"],
)

rejected_data_packed = apply_rlhf_data_format(rejected, tokenizer)
(rejected_input_ids, rejected_loss_mask, rejected_label_decode) = (
rejected_input_ids, rejected_loss_mask, rejected_label_decode = (
rejected_data_packed["input_ids"],
rejected_data_packed["loss_mask"],
rejected_data_packed["label_decode"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
https://github.com/volcengine/verl
"""


import json

import torch
Expand Down
4 changes: 2 additions & 2 deletions applications/ColossalChat/coati/trainer/kto.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def _train(self, epoch: int):
)
for i, batch in enumerate(self.train_dataloader):
batch = to_device(batch, self.device)
(input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask) = (
input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask = (
batch["input_ids"],
batch["attention_mask"],
batch["loss_mask"],
Expand Down Expand Up @@ -279,7 +279,7 @@ def _eval(self, epoch: int):
)
for i, batch in enumerate(self.train_dataloader):
batch = to_device(batch, self.device)
(input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask) = (
input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask = (
batch["input_ids"],
batch["attention_mask"],
batch["loss_mask"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def _init_optimizer(self):
def _prepare_model_with_strategy(self, has_optimizer: bool):
if has_optimizer:
self._init_optimizer()
(self._model, self._optimizer) = self._strategy.prepare((self._model, self._optimizer))
self._model, self._optimizer = self._strategy.prepare((self._model, self._optimizer))
else:
self._model = self._strategy.prepare(self._model)

Expand Down
4 changes: 2 additions & 2 deletions applications/ColossalQA/examples/webui_demo/webui.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,11 @@ def restart(chatbot, txt):
)
with gr.Row():
btn = gr.UploadButton("📁", file_types=["file"], file_count="multiple", size="sm")
restart_btn = gr.Button(str("\u21BB"), elem_id="restart-btn", scale=1)
restart_btn = gr.Button(str("\u21bb"), elem_id="restart-btn", scale=1)
txt = gr.Textbox(
scale=8,
show_label=False,
placeholder="Enter text and press enter, or use 📁 to upload files, click \u21BB to clear loaded files and restart chat",
placeholder="Enter text and press enter, or use 📁 to upload files, click \u21bb to clear loaded files and restart chat",
container=True,
autofocus=True,
)
Expand Down
4 changes: 2 additions & 2 deletions colossalai/auto_parallel/tensor_shard/solver/solver.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""This code is adapted from Alpa
https://github.com/alpa-projects/alpa/
with some changes. """
https://github.com/alpa-projects/alpa/
with some changes."""

import multiprocessing
import time
Expand Down
2 changes: 1 addition & 1 deletion colossalai/autochunk/select_chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos):
return best_region

def _is_legal_region(self, cur_chunk_info, chunk_infos):
(chunk_region_start, chunk_region_end) = cur_chunk_info["region"]
chunk_region_start, chunk_region_end = cur_chunk_info["region"]
if cur_chunk_info in chunk_infos:
return False
if chunk_region_end < chunk_region_start:
Expand Down
6 changes: 2 additions & 4 deletions colossalai/booster/plugin/gemini_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,10 +338,8 @@ def load_sharded_optimizer(
# Load param_groups.
param_group_path = ckpt_index_file.get_param_group_filename()
if param_group_path is None:
raise RuntimeError(
f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory."
)
raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory.")
saved_param_groups = torch.load(param_group_path)
optimizer.load_param_groups(saved_param_groups)

Expand Down
6 changes: 2 additions & 4 deletions colossalai/booster/plugin/low_level_zero_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,10 +268,8 @@ def load_sharded_optimizer(
# Load param_groups
param_group_path = ckpt_index_file.get_param_group_filename()
if param_group_path is None:
raise RuntimeError(
f"Invalid index file path {index_file_path} for an optimizer. \
Lacking param group file under current directory."
)
raise RuntimeError(f"Invalid index file path {index_file_path} for an optimizer. \
Lacking param group file under current directory.")
id_map = load_param_groups_into_optimizer(optimizer, param_group_path)

checkpoint_files, _ = ckpt_index_file.get_checkpoint_filenames()
Expand Down
6 changes: 2 additions & 4 deletions colossalai/checkpoint_io/general_checkpoint_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,8 @@ def load_sharded_optimizer(
# Load param_groups
param_group_path = ckpt_index_file.get_param_group_filename()
if param_group_path is None:
raise RuntimeError(
f"Invalid index file path {index_file_path} for an optimizer. \
Lacking param group file under current directory."
)
raise RuntimeError(f"Invalid index file path {index_file_path} for an optimizer. \
Lacking param group file under current directory.")
id_map = load_param_groups_into_optimizer(optimizer, param_group_path)

checkpoint_files, _ = ckpt_index_file.get_checkpoint_filenames()
Expand Down
6 changes: 2 additions & 4 deletions colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,10 +690,8 @@ def _get_param_id_from_optimizer_param(
# Load param_groups
param_group_path = ckpt_index_file.get_param_group_filename()
if param_group_path is None:
raise RuntimeError(
f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory."
)
raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory.")
saved_groups = torch.load(param_group_path)

updated_groups = []
Expand Down
6 changes: 2 additions & 4 deletions colossalai/checkpoint_io/moe_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,10 +559,8 @@ def _get_param_id_from_optimizer_param(
# Load param_groups
param_group_path = ckpt_index_file.get_param_group_filename()
if param_group_path is None:
raise RuntimeError(
f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory."
)
raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory.")
saved_groups = torch.load(param_group_path)

updated_groups = []
Expand Down
4 changes: 2 additions & 2 deletions colossalai/device/alpha_beta_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def profile_latency(self, process_group, pg_handler):
latency_list = []
for i in range(self.latency_iters):
nbytes = int(BYTE << i)
(t, _) = self._profile(process_group, pg_handler, nbytes)
t, _ = self._profile(process_group, pg_handler, nbytes)
latency_list.append(t)

if latency_list[0] is None:
Expand All @@ -157,7 +157,7 @@ def profile_bandwidth(self, process_group, pg_handler, maxbytes=(1 * GB)):
process_group: A tuple of global rank of the process group.
pg_handler: The handler of the process group.
"""
(_, bandwidth) = self._profile(process_group, pg_handler, maxbytes)
_, bandwidth = self._profile(process_group, pg_handler, maxbytes)
return bandwidth

def profile_ab(self):
Expand Down
4 changes: 2 additions & 2 deletions colossalai/device/device_mesh.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""This code is adapted from Alpa
https://github.com/alpa-projects/alpa/
with some changes. """
https://github.com/alpa-projects/alpa/
with some changes."""

import operator
from dataclasses import dataclass
Expand Down
1 change: 1 addition & 0 deletions colossalai/fx/tracer/tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
Implemented a tracer which supports control flow and user-defined meta arguments.
The implementation is partly inspired HuggingFace's fx tracer
"""

import enum
import functools
import inspect
Expand Down
4 changes: 1 addition & 3 deletions colossalai/inference/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,7 @@ def recycle(self) -> None:
"""
Recycle a running sequence to the waiting list
"""
assert (
not self.check_finish() and not self.status == RequestStatus.ABORTED
), "The running sequence \
assert not self.check_finish() and not self.status == RequestStatus.ABORTED, "The running sequence \
is already done but it still in running list"
self.status = RequestStatus.RECYCLED

Expand Down
2 changes: 1 addition & 1 deletion colossalai/kernel/triton/llama_act_combine_kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def forward(ctx: Any, x_gate: torch.Tensor, x_up: torch.Tensor, activation: str
@custom_bwd
def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, Tensor, None, None]:
# restore from ctx
(x_gate1, x_gate2, x_up) = ctx.saved_tensors
x_gate1, x_gate2, x_up = ctx.saved_tensors
M, N, BLOCK_SIZE, num_warps = ctx.M, ctx.N, ctx.BLOCK_SIZE, ctx.num_warps

# init grad
Expand Down
6 changes: 2 additions & 4 deletions colossalai/legacy/engine/schedule/_base_schedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,6 @@ def _call_engine_criterion(engine, outputs, labels):
elif isinstance(outputs, dict) and isinstance(labels, (list, tuple)):
raise ValueError(f"Expected labels to be a dict when the model outputs are dict, but got {type(labels)}")
else:
raise TypeError(
f"Expected model outputs and labels to be of type torch.Tensor ' \
raise TypeError(f"Expected model outputs and labels to be of type torch.Tensor ' \
'(which is auto-converted to tuple), list, tuple, or dict, ' \
'but got {type(outputs)} (model outputs) and {type(labels)} (labels)"
)
'but got {type(outputs)} (model outputs) and {type(labels)} (labels)")
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Adapted from https://github.com/ModelTC/lightllm

"""Sampling parameters for text generation."""

from typing import List, Optional, Union

_SAMPLING_EPS = 1e-5
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ def bloom_attention_forward(
fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size]

# 3 x [batch_size, seq_length, num_heads, head_dim]
(query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
query_layer, key_layer, value_layer = self._split_heads(fused_qkv)
batch_size, q_length, H, D_HEAD = query_layer.shape
k = key_layer.reshape(-1, H, D_HEAD) # batch_size * q_length, H, D_HEAD, q_length == 1
v = value_layer.reshape(-1, H, D_HEAD) # batch_size * q_length, H, D_HEAD, q_length == 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ def chatglm_flash_attn_kvcache_forward(
# Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
mixed_x_layer = self.query_key_value(hidden_states)
if self.multi_query_attention:
(query_layer, key_layer, value_layer) = mixed_x_layer.split(
query_layer, key_layer, value_layer = mixed_x_layer.split(
[
self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
Expand Down Expand Up @@ -436,7 +436,7 @@ def chatglm_flash_attn_kvcache_forward(
)
mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
# [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
(query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
query_layer, key_layer, value_layer = split_tensor_along_last_dim(mixed_x_layer, 3)
cos, sin = infer_state.position_cos, infer_state.position_sin

chatglm2_rotary_emb_fwd(
Expand Down
3 changes: 2 additions & 1 deletion colossalai/legacy/moe/openmoe/model/modeling_openmoe.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch OpenMoE model."""
"""PyTorch OpenMoE model."""

import math
from typing import List, Optional, Tuple, Union

Expand Down
2 changes: 1 addition & 1 deletion colossalai/legacy/nn/layer/parallel_sequence/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def forward(self, hidden_states, attention_mask):
"the last dimension is not a multiple of 3, " "cannot be divided into query, key and value"
)
partition_size = last_dim_value // 3
(query_layer, key_layer, value_layer) = torch.split(mixed_x_layer, partition_size, dim=last_dim)
query_layer, key_layer, value_layer = torch.split(mixed_x_layer, partition_size, dim=last_dim)

# attention scores: [batch_size, num_heads, sub_seq_len, seq_len]
output_size = (
Expand Down
4 changes: 2 additions & 2 deletions colossalai/nn/layer/layernorm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""This code is from NVIDIA apex:
https://github.com/NVIDIA/apex
with some changes. """
https://github.com/NVIDIA/apex
with some changes."""

import numbers

Expand Down
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/distributed_galore.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""
"""adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""

import warnings
from collections import defaultdict
Expand Down
1 change: 1 addition & 0 deletions colossalai/nn/optimizer/fused_adam.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
Licensed under the MIT License.
"""

import torch

from colossalai.utils import get_current_device, multi_tensor_applier
Expand Down
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/galore.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""
"""adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""

import warnings
from typing import List
Expand Down
Loading
Loading