diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 79decaf95022..70731e28f5bb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,35 +1,35 @@ repos: - repo: https://github.com/PyCQA/autoflake - rev: v2.3.1 + rev: v2.3.3 hooks: - id: autoflake name: autoflake (python) args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports'] - repo: https://github.com/pycqa/isort - rev: 5.13.2 + rev: 8.0.1 hooks: - id: isort name: sort all imports (python) args: ["--profile", "black"] # avoid conflict with black - repo: https://github.com/psf/black-pre-commit-mirror - rev: 24.10.0 + rev: 26.3.1 hooks: - id: black name: black formatter args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310'] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v19.1.5 + rev: v22.1.2 hooks: - id: clang-format name: clang formatter types_or: [c++, c] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: check-yaml - id: check-merge-conflict diff --git a/applications/Colossal-LLaMA/colossal_llama/model/init_model.py b/applications/Colossal-LLaMA/colossal_llama/model/init_model.py index f61291f35d04..7391594d9556 100644 --- a/applications/Colossal-LLaMA/colossal_llama/model/init_model.py +++ b/applications/Colossal-LLaMA/colossal_llama/model/init_model.py @@ -4,6 +4,7 @@ """ Initialize new model with updated tokenizer by calculating the mean values from original model """ + import argparse import numpy as np diff --git a/applications/ColossalChat/coati/dataset/tokenization_utils.py b/applications/ColossalChat/coati/dataset/tokenization_utils.py index 893090edfa30..4c7ed0f36909 100755 --- a/applications/ColossalChat/coati/dataset/tokenization_utils.py +++ b/applications/ColossalChat/coati/dataset/tokenization_utils.py @@ -56,10 +56,8 @@ def tokenize_sft( template.messages = [] for idx, mess in enumerate(messages): if mess["from"] != template.roles[idx % 2]: - raise ValueError( - f"Message should iterate between user and assistant and starts with a \ - line from the user. Got the following data:\n{messages}" - ) + raise ValueError(f"Message should iterate between user and assistant and starts with a \ + line from the user. Got the following data:\n{messages}") template.append_message(mess["from"], mess["content"]) if len(template.messages) % 2 != 0: @@ -245,10 +243,8 @@ def tokenize_rlhf( for idx, mess in enumerate(context): if mess["from"] != template.roles[idx % 2]: - raise ValueError( - f"Message should iterate between user and assistant and starts with a \ - line from the user. Got the following data:\n{context}" - ) + raise ValueError(f"Message should iterate between user and assistant and starts with a \ + line from the user. Got the following data:\n{context}") template.append_message(mess["from"], mess["content"]) if len(template.messages) % 2 != 1: @@ -272,18 +268,14 @@ def tokenize_rlhf( rejected_continuation = data_point["rejected"] for round in range(len(chosen_continuation)): if chosen_continuation[round]["from"] != template.roles[(round + 1) % 2]: - raise ValueError( - f"Message should iterate between user and assistant and starts with a \ - line from the user. Got the following data:\n{chosen_continuation}" - ) + raise ValueError(f"Message should iterate between user and assistant and starts with a \ + line from the user. Got the following data:\n{chosen_continuation}") chosen.append_message(chosen_continuation[round]["from"], chosen_continuation[round]["content"]) for round in range(len(rejected_continuation)): if rejected_continuation[round]["from"] != template.roles[(round + 1) % 2]: - raise ValueError( - f"Message should iterate between user and assistant and starts with a \ - line from the user. Got the following data:\n{rejected_continuation}" - ) + raise ValueError(f"Message should iterate between user and assistant and starts with a \ + line from the user. Got the following data:\n{rejected_continuation}") rejected.append_message(rejected_continuation[round]["from"], rejected_continuation[round]["content"]) ( @@ -296,14 +288,14 @@ def tokenize_rlhf( ) = (None, None, None, None, None, None) chosen_data_packed = apply_rlhf_data_format(chosen, tokenizer) - (chosen_input_ids, chosen_loss_mask, chosen_label_decode) = ( + chosen_input_ids, chosen_loss_mask, chosen_label_decode = ( chosen_data_packed["input_ids"], chosen_data_packed["loss_mask"], chosen_data_packed["label_decode"], ) rejected_data_packed = apply_rlhf_data_format(rejected, tokenizer) - (rejected_input_ids, rejected_loss_mask, rejected_label_decode) = ( + rejected_input_ids, rejected_loss_mask, rejected_label_decode = ( rejected_data_packed["input_ids"], rejected_data_packed["loss_mask"], rejected_data_packed["label_decode"], diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index f7a2fb89cadb..a7573d2f201b 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -17,7 +17,6 @@ https://github.com/volcengine/verl """ - import json import torch diff --git a/applications/ColossalChat/coati/trainer/kto.py b/applications/ColossalChat/coati/trainer/kto.py index f87bf53c40cf..afbd4fe10f5c 100755 --- a/applications/ColossalChat/coati/trainer/kto.py +++ b/applications/ColossalChat/coati/trainer/kto.py @@ -130,7 +130,7 @@ def _train(self, epoch: int): ) for i, batch in enumerate(self.train_dataloader): batch = to_device(batch, self.device) - (input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask) = ( + input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask = ( batch["input_ids"], batch["attention_mask"], batch["loss_mask"], @@ -279,7 +279,7 @@ def _eval(self, epoch: int): ) for i, batch in enumerate(self.train_dataloader): batch = to_device(batch, self.device) - (input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask) = ( + input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask = ( batch["input_ids"], batch["attention_mask"], batch["loss_mask"], diff --git a/applications/ColossalChat/examples/community/ray/train_prompts_on_ray.py b/applications/ColossalChat/examples/community/ray/train_prompts_on_ray.py index 8abd83a8b249..3de2e07344b8 100755 --- a/applications/ColossalChat/examples/community/ray/train_prompts_on_ray.py +++ b/applications/ColossalChat/examples/community/ray/train_prompts_on_ray.py @@ -120,7 +120,7 @@ def _init_optimizer(self): def _prepare_model_with_strategy(self, has_optimizer: bool): if has_optimizer: self._init_optimizer() - (self._model, self._optimizer) = self._strategy.prepare((self._model, self._optimizer)) + self._model, self._optimizer = self._strategy.prepare((self._model, self._optimizer)) else: self._model = self._strategy.prepare(self._model) diff --git a/applications/ColossalQA/examples/webui_demo/webui.py b/applications/ColossalQA/examples/webui_demo/webui.py index 1e34330615b5..5ab5df99fab6 100644 --- a/applications/ColossalQA/examples/webui_demo/webui.py +++ b/applications/ColossalQA/examples/webui_demo/webui.py @@ -81,11 +81,11 @@ def restart(chatbot, txt): ) with gr.Row(): btn = gr.UploadButton("📁", file_types=["file"], file_count="multiple", size="sm") - restart_btn = gr.Button(str("\u21BB"), elem_id="restart-btn", scale=1) + restart_btn = gr.Button(str("\u21bb"), elem_id="restart-btn", scale=1) txt = gr.Textbox( scale=8, show_label=False, - placeholder="Enter text and press enter, or use 📁 to upload files, click \u21BB to clear loaded files and restart chat", + placeholder="Enter text and press enter, or use 📁 to upload files, click \u21bb to clear loaded files and restart chat", container=True, autofocus=True, ) diff --git a/colossalai/auto_parallel/tensor_shard/solver/solver.py b/colossalai/auto_parallel/tensor_shard/solver/solver.py index 088d1acb5177..447a4028e368 100644 --- a/colossalai/auto_parallel/tensor_shard/solver/solver.py +++ b/colossalai/auto_parallel/tensor_shard/solver/solver.py @@ -1,6 +1,6 @@ """This code is adapted from Alpa - https://github.com/alpa-projects/alpa/ - with some changes. """ + https://github.com/alpa-projects/alpa/ +with some changes.""" import multiprocessing import time diff --git a/colossalai/autochunk/select_chunk.py b/colossalai/autochunk/select_chunk.py index 8a60ba681f70..92ef70d4736c 100644 --- a/colossalai/autochunk/select_chunk.py +++ b/colossalai/autochunk/select_chunk.py @@ -176,7 +176,7 @@ def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos): return best_region def _is_legal_region(self, cur_chunk_info, chunk_infos): - (chunk_region_start, chunk_region_end) = cur_chunk_info["region"] + chunk_region_start, chunk_region_end = cur_chunk_info["region"] if cur_chunk_info in chunk_infos: return False if chunk_region_end < chunk_region_start: diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index a81f9b05d7d7..36eccc94f5eb 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -338,10 +338,8 @@ def load_sharded_optimizer( # Load param_groups. param_group_path = ckpt_index_file.get_param_group_filename() if param_group_path is None: - raise RuntimeError( - f"Invalid index file path {checkpoint_index_file} for an optimizer. \ - Lacking param group file under current directory." - ) + raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \ + Lacking param group file under current directory.") saved_param_groups = torch.load(param_group_path) optimizer.load_param_groups(saved_param_groups) diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 642969be3a68..75ec7a3487a7 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -268,10 +268,8 @@ def load_sharded_optimizer( # Load param_groups param_group_path = ckpt_index_file.get_param_group_filename() if param_group_path is None: - raise RuntimeError( - f"Invalid index file path {index_file_path} for an optimizer. \ - Lacking param group file under current directory." - ) + raise RuntimeError(f"Invalid index file path {index_file_path} for an optimizer. \ + Lacking param group file under current directory.") id_map = load_param_groups_into_optimizer(optimizer, param_group_path) checkpoint_files, _ = ckpt_index_file.get_checkpoint_filenames() diff --git a/colossalai/checkpoint_io/general_checkpoint_io.py b/colossalai/checkpoint_io/general_checkpoint_io.py index 5dfb09248b53..d511ec76ec60 100644 --- a/colossalai/checkpoint_io/general_checkpoint_io.py +++ b/colossalai/checkpoint_io/general_checkpoint_io.py @@ -86,10 +86,8 @@ def load_sharded_optimizer( # Load param_groups param_group_path = ckpt_index_file.get_param_group_filename() if param_group_path is None: - raise RuntimeError( - f"Invalid index file path {index_file_path} for an optimizer. \ - Lacking param group file under current directory." - ) + raise RuntimeError(f"Invalid index file path {index_file_path} for an optimizer. \ + Lacking param group file under current directory.") id_map = load_param_groups_into_optimizer(optimizer, param_group_path) checkpoint_files, _ = ckpt_index_file.get_checkpoint_filenames() diff --git a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py index 9d972635214d..9ae29cd5ed02 100644 --- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py +++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py @@ -690,10 +690,8 @@ def _get_param_id_from_optimizer_param( # Load param_groups param_group_path = ckpt_index_file.get_param_group_filename() if param_group_path is None: - raise RuntimeError( - f"Invalid index file path {checkpoint_index_file} for an optimizer. \ - Lacking param group file under current directory." - ) + raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \ + Lacking param group file under current directory.") saved_groups = torch.load(param_group_path) updated_groups = [] diff --git a/colossalai/checkpoint_io/moe_checkpoint.py b/colossalai/checkpoint_io/moe_checkpoint.py index 85e36f7c6336..d7e2460dd422 100644 --- a/colossalai/checkpoint_io/moe_checkpoint.py +++ b/colossalai/checkpoint_io/moe_checkpoint.py @@ -559,10 +559,8 @@ def _get_param_id_from_optimizer_param( # Load param_groups param_group_path = ckpt_index_file.get_param_group_filename() if param_group_path is None: - raise RuntimeError( - f"Invalid index file path {checkpoint_index_file} for an optimizer. \ - Lacking param group file under current directory." - ) + raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \ + Lacking param group file under current directory.") saved_groups = torch.load(param_group_path) updated_groups = [] diff --git a/colossalai/device/alpha_beta_profiler.py b/colossalai/device/alpha_beta_profiler.py index 88520b2a14d0..f3b0150c88fa 100644 --- a/colossalai/device/alpha_beta_profiler.py +++ b/colossalai/device/alpha_beta_profiler.py @@ -138,7 +138,7 @@ def profile_latency(self, process_group, pg_handler): latency_list = [] for i in range(self.latency_iters): nbytes = int(BYTE << i) - (t, _) = self._profile(process_group, pg_handler, nbytes) + t, _ = self._profile(process_group, pg_handler, nbytes) latency_list.append(t) if latency_list[0] is None: @@ -157,7 +157,7 @@ def profile_bandwidth(self, process_group, pg_handler, maxbytes=(1 * GB)): process_group: A tuple of global rank of the process group. pg_handler: The handler of the process group. """ - (_, bandwidth) = self._profile(process_group, pg_handler, maxbytes) + _, bandwidth = self._profile(process_group, pg_handler, maxbytes) return bandwidth def profile_ab(self): diff --git a/colossalai/device/device_mesh.py b/colossalai/device/device_mesh.py index 171d8876201f..a88093182c49 100644 --- a/colossalai/device/device_mesh.py +++ b/colossalai/device/device_mesh.py @@ -1,6 +1,6 @@ """This code is adapted from Alpa - https://github.com/alpa-projects/alpa/ - with some changes. """ + https://github.com/alpa-projects/alpa/ +with some changes.""" import operator from dataclasses import dataclass diff --git a/colossalai/fx/tracer/tracer.py b/colossalai/fx/tracer/tracer.py index d9cb587b5d39..5ad7c410a70a 100644 --- a/colossalai/fx/tracer/tracer.py +++ b/colossalai/fx/tracer/tracer.py @@ -4,6 +4,7 @@ Implemented a tracer which supports control flow and user-defined meta arguments. The implementation is partly inspired HuggingFace's fx tracer """ + import enum import functools import inspect diff --git a/colossalai/inference/struct.py b/colossalai/inference/struct.py index 65d284296bcb..a0af7d4755e8 100644 --- a/colossalai/inference/struct.py +++ b/colossalai/inference/struct.py @@ -168,9 +168,7 @@ def recycle(self) -> None: """ Recycle a running sequnce to waiitting list """ - assert ( - not self.check_finish() and not self.status == RequestStatus.ABORTED - ), "The running sequence \ + assert not self.check_finish() and not self.status == RequestStatus.ABORTED, "The running sequence \ is already done but it still in running list" self.status = RequestStatus.RECYCLED diff --git a/colossalai/kernel/triton/llama_act_combine_kernel.py b/colossalai/kernel/triton/llama_act_combine_kernel.py index 7a2c7e8fbd74..dcf59ec09129 100644 --- a/colossalai/kernel/triton/llama_act_combine_kernel.py +++ b/colossalai/kernel/triton/llama_act_combine_kernel.py @@ -157,7 +157,7 @@ def forward(ctx: Any, x_gate: torch.Tensor, x_up: torch.Tensor, activation: str @custom_bwd def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, Tensor, None, None]: # restore from ctx - (x_gate1, x_gate2, x_up) = ctx.saved_tensors + x_gate1, x_gate2, x_up = ctx.saved_tensors M, N, BLOCK_SIZE, num_warps = ctx.M, ctx.N, ctx.BLOCK_SIZE, ctx.num_warps # init grad diff --git a/colossalai/legacy/engine/schedule/_base_schedule.py b/colossalai/legacy/engine/schedule/_base_schedule.py index 9b2913442225..8e779a308482 100644 --- a/colossalai/legacy/engine/schedule/_base_schedule.py +++ b/colossalai/legacy/engine/schedule/_base_schedule.py @@ -135,8 +135,6 @@ def _call_engine_criterion(engine, outputs, labels): elif isinstance(outputs, dict) and isinstance(labels, (list, tuple)): raise ValueError(f"Expected labels to be a dict when the model outputs are dict, but got {type(labels)}") else: - raise TypeError( - f"Expected model outputs and labels to be of type torch.Tensor ' \ + raise TypeError(f"Expected model outputs and labels to be of type torch.Tensor ' \ '(which is auto-converted to tuple), list, tuple, or dict, ' \ - 'but got {type(outputs)} (model outputs) and {type(labels)} (labels)" - ) + 'but got {type(outputs)} (model outputs) and {type(labels)} (labels)") diff --git a/colossalai/legacy/inference/dynamic_batching/sampling_params.py b/colossalai/legacy/inference/dynamic_batching/sampling_params.py index a37a83390021..4c7f4e7f0a99 100644 --- a/colossalai/legacy/inference/dynamic_batching/sampling_params.py +++ b/colossalai/legacy/inference/dynamic_batching/sampling_params.py @@ -1,6 +1,7 @@ # Adapted from https://github.com/ModelTC/lightllm """Sampling parameters for text generation.""" + from typing import List, Optional, Union _SAMPLING_EPS = 1e-5 diff --git a/colossalai/legacy/inference/tensor_parallel/modeling/bloom.py b/colossalai/legacy/inference/tensor_parallel/modeling/bloom.py index 74fa5f470bf8..1ff9940cf3a2 100644 --- a/colossalai/legacy/inference/tensor_parallel/modeling/bloom.py +++ b/colossalai/legacy/inference/tensor_parallel/modeling/bloom.py @@ -448,7 +448,7 @@ def bloom_attention_forward( fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] # 3 x [batch_size, seq_length, num_heads, head_dim] - (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + query_layer, key_layer, value_layer = self._split_heads(fused_qkv) batch_size, q_length, H, D_HEAD = query_layer.shape k = key_layer.reshape(-1, H, D_HEAD) # batch_size * q_length, H, D_HEAD, q_lenth == 1 v = value_layer.reshape(-1, H, D_HEAD) # batch_size * q_length, H, D_HEAD, q_lenth == 1 diff --git a/colossalai/legacy/inference/tensor_parallel/modeling/chatglm2.py b/colossalai/legacy/inference/tensor_parallel/modeling/chatglm2.py index b8fe8eb54855..c429c4301f1d 100644 --- a/colossalai/legacy/inference/tensor_parallel/modeling/chatglm2.py +++ b/colossalai/legacy/inference/tensor_parallel/modeling/chatglm2.py @@ -399,7 +399,7 @@ def chatglm_flash_attn_kvcache_forward( # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] mixed_x_layer = self.query_key_value(hidden_states) if self.multi_query_attention: - (query_layer, key_layer, value_layer) = mixed_x_layer.split( + query_layer, key_layer, value_layer = mixed_x_layer.split( [ self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, @@ -436,7 +436,7 @@ def chatglm_flash_attn_kvcache_forward( ) mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + query_layer, key_layer, value_layer = split_tensor_along_last_dim(mixed_x_layer, 3) cos, sin = infer_state.position_cos, infer_state.position_sin chatglm2_rotary_emb_fwd( diff --git a/colossalai/legacy/moe/openmoe/model/modeling_openmoe.py b/colossalai/legacy/moe/openmoe/model/modeling_openmoe.py index 5d6e91765883..b3075a3e450d 100644 --- a/colossalai/legacy/moe/openmoe/model/modeling_openmoe.py +++ b/colossalai/legacy/moe/openmoe/model/modeling_openmoe.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch OpenMoE model.""" +"""PyTorch OpenMoE model.""" + import math from typing import List, Optional, Tuple, Union diff --git a/colossalai/legacy/nn/layer/parallel_sequence/layers.py b/colossalai/legacy/nn/layer/parallel_sequence/layers.py index 445b7e4cda2a..486b77d121f6 100644 --- a/colossalai/legacy/nn/layer/parallel_sequence/layers.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/layers.py @@ -122,7 +122,7 @@ def forward(self, hidden_states, attention_mask): "the last dimension is not a multiple of 3, " "cannot be divided into query, key and value" ) partition_size = last_dim_value // 3 - (query_layer, key_layer, value_layer) = torch.split(mixed_x_layer, partition_size, dim=last_dim) + query_layer, key_layer, value_layer = torch.split(mixed_x_layer, partition_size, dim=last_dim) # attention scores: [batch_size, num_heads, sub_seq_len, seq_len] output_size = ( diff --git a/colossalai/nn/layer/layernorm.py b/colossalai/nn/layer/layernorm.py index 1db48faee213..5b258ca3e71d 100644 --- a/colossalai/nn/layer/layernorm.py +++ b/colossalai/nn/layer/layernorm.py @@ -1,6 +1,6 @@ """This code is from NVIDIA apex: - https://github.com/NVIDIA/apex - with some changes. """ + https://github.com/NVIDIA/apex +with some changes.""" import numbers diff --git a/colossalai/nn/optimizer/distributed_galore.py b/colossalai/nn/optimizer/distributed_galore.py index edd119c7f3a9..8c9745255e86 100644 --- a/colossalai/nn/optimizer/distributed_galore.py +++ b/colossalai/nn/optimizer/distributed_galore.py @@ -1,4 +1,4 @@ -""" adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py""" +"""adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py""" import warnings from collections import defaultdict diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py index c12551657318..029dda304fa0 100644 --- a/colossalai/nn/optimizer/fused_adam.py +++ b/colossalai/nn/optimizer/fused_adam.py @@ -6,6 +6,7 @@ This file is adapted from fused adam in NVIDIA/apex, commit a109f85 Licensed under the MIT License. """ + import torch from colossalai.utils import get_current_device, multi_tensor_applier diff --git a/colossalai/nn/optimizer/galore.py b/colossalai/nn/optimizer/galore.py index 7db97605d47e..c5e191e6a227 100644 --- a/colossalai/nn/optimizer/galore.py +++ b/colossalai/nn/optimizer/galore.py @@ -1,4 +1,4 @@ -""" adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py""" +"""adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py""" import warnings from typing import List diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py index 71e96c5b0d3e..4e8236e38b2a 100644 --- a/colossalai/shardformer/layer/attn.py +++ b/colossalai/shardformer/layer/attn.py @@ -545,9 +545,7 @@ def attention( RingAttention.ATTN_DONE = torch.cuda.Event() if RingAttention.SP_STREAM is None: RingAttention.SP_STREAM = torch.cuda.Stream() - assert ( - q.shape[2] == k.shape[2] - ), "Q, K and V having different sequence lengths (inference or cross-attn)\ + assert q.shape[2] == k.shape[2], "Q, K and V having different sequence lengths (inference or cross-attn)\ is not supported yet in training." assert ( attention_mask_type in RingAttention.SUPPORTED_MASK_TYPES @@ -719,7 +717,7 @@ def forward( # Helper to pass args to FA def _forward(q, k, v, causal): if version.parse(flash_attn.__version__) > version.parse("2.6.3"): - (out, softmax_lse, S_dmask, rng_state) = _flash_attn_forward( + out, softmax_lse, S_dmask, rng_state = _flash_attn_forward( q, k, v, @@ -778,7 +776,7 @@ def _local_ring_forward(): # Compute with local KV; no mask kv_block = kv_buffers[0] q_block = q - (block_out[i % 2], block_softmax_lse[i % 2], rng_states[i]) = _forward( # (T, H, D) # (H, T) + block_out[i % 2], block_softmax_lse[i % 2], rng_states[i] = _forward( # (T, H, D) # (H, T) q_block, kv_block[0], kv_block[1], causal=True ) elif i <= local_sp_rank: @@ -945,7 +943,7 @@ def backward(ctx, dout, _): over all ranks for accumulation. We avoid using two streams due to backward using doubled buffers and more comm cost. """ - (q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_kv, half_idx_front, half_idx_back) = ctx.saved_tensors[:9] + q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_kv, half_idx_front, half_idx_back = ctx.saved_tensors[:9] rng_states = ctx.saved_tensors[9:] is_packed = ctx.is_packed diff --git a/colossalai/shardformer/modeling/chatglm2.py b/colossalai/shardformer/modeling/chatglm2.py index be13200b5c4f..8ce67dbe9a0b 100644 --- a/colossalai/shardformer/modeling/chatglm2.py +++ b/colossalai/shardformer/modeling/chatglm2.py @@ -1,4 +1,4 @@ -""" PyTorch ChatGLM model. """ +"""PyTorch ChatGLM model.""" from typing import List, Optional, Tuple @@ -482,7 +482,7 @@ def forward( mixed_x_layer = self.query_key_value(hidden_states) if self.multi_query_attention: - (query_layer, key_layer, value_layer) = mixed_x_layer.split( + query_layer, key_layer, value_layer = mixed_x_layer.split( [ self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, @@ -518,7 +518,7 @@ def forward( ) mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + query_layer, key_layer, value_layer = split_tensor_along_last_dim(mixed_x_layer, 3) # sp: all-to-all comminucation when introducing sequence parallel if sp_mode == "all_to_all": diff --git a/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py b/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py index 6ae4b06e517a..9ced3712cb9e 100644 --- a/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py +++ b/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py @@ -444,7 +444,7 @@ def forward( # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] mixed_x_layer = self.query_key_value(hidden_states) if self.multi_query_attention: - (query_layer, key_layer, value_layer) = mixed_x_layer.split( + query_layer, key_layer, value_layer = mixed_x_layer.split( [ self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, @@ -480,7 +480,7 @@ def forward( ) mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + query_layer, key_layer, value_layer = split_tensor_along_last_dim(mixed_x_layer, 3) # apply relative positional encoding (rotary embedding) if rotary_pos_emb is not None: diff --git a/examples/community/roberta/preprocessing/get_mask.py b/examples/community/roberta/preprocessing/get_mask.py index f0ba8fe38501..0a8991dd85cf 100644 --- a/examples/community/roberta/preprocessing/get_mask.py +++ b/examples/community/roberta/preprocessing/get_mask.py @@ -34,8 +34,8 @@ def __init__( self.do_whole_word_mask = do_whole_word_mask self.max_predictions_per_seq = max_predictions_per_seq self.vocab_words = list(tokenizer.vocab.keys()) - self.rec = re.compile("[\u4E00-\u9FA5]") - self.whole_rec = re.compile("##[\u4E00-\u9FA5]") + self.rec = re.compile("[\u4e00-\u9fa5]") + self.whole_rec = re.compile("##[\u4e00-\u9fa5]") self.mlm_p = 0.15 self.mlm_mask_p = 0.8 diff --git a/examples/community/roberta/preprocessing/mask.cpp b/examples/community/roberta/preprocessing/mask.cpp index d44f58eccfc2..428faa220e82 100644 --- a/examples/community/roberta/preprocessing/mask.cpp +++ b/examples/community/roberta/preprocessing/mask.cpp @@ -75,15 +75,15 @@ auto get_new_segment( return new_segment; } -bool startsWith(const std::string &s, const std::string &sub) { +bool startsWith(const std::string& s, const std::string& sub) { return s.find(sub) == 0 ? true : false; } auto create_whole_masked_lm_predictions( - std::vector &tokens, - const std::vector &original_tokens, - const std::vector &vocab_words, - std::map &vocab, const int max_predictions_per_seq, + std::vector& tokens, + const std::vector& original_tokens, + const std::vector& vocab_words, + std::map& vocab, const int max_predictions_per_seq, const double masked_lm_prob) { // for (auto item : vocab) { // std::cout << "key=" << std::string(py::str(item.first)) << ", " diff --git a/examples/community/roberta/pretraining/model/deberta_v2.py b/examples/community/roberta/pretraining/model/deberta_v2.py index c7457942e164..e3871964c503 100644 --- a/examples/community/roberta/pretraining/model/deberta_v2.py +++ b/examples/community/roberta/pretraining/model/deberta_v2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DeBERTa-v2 model.""" +"""PyTorch DeBERTa-v2 model.""" import math from collections.abc import Sequence diff --git a/examples/images/vit/vit_train_demo.py b/examples/images/vit/vit_train_demo.py index a65f89171a03..72df27c1c855 100644 --- a/examples/images/vit/vit_train_demo.py +++ b/examples/images/vit/vit_train_demo.py @@ -126,11 +126,9 @@ def evaluate_model( avg_loss = "{:.4f}".format(accum_loss.item()) accuracy = "{:.4f}".format(accum_correct.item() / total_num.item()) if coordinator.is_master(): - print( - f"Evaluation result for epoch {epoch + 1}: \ + print(f"Evaluation result for epoch {epoch + 1}: \ average_loss={avg_loss}, \ - accuracy={accuracy}." - ) + accuracy={accuracy}.") def main(): diff --git a/examples/tutorial/opt/opt/run_clm.py b/examples/tutorial/opt/opt/run_clm.py index cb62f77e1add..ca0d49011674 100644 --- a/examples/tutorial/opt/opt/run_clm.py +++ b/examples/tutorial/opt/opt/run_clm.py @@ -20,6 +20,7 @@ Here is the full list of checkpoints on the hub that can be fine-tuned by this script: https://huggingface.co/models?filter=text-generation """ + # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. import math diff --git a/examples/tutorial/sequence_parallel/data/__init__.py b/examples/tutorial/sequence_parallel/data/__init__.py index 137f3cf0267b..cbc4e65a690a 100644 --- a/examples/tutorial/sequence_parallel/data/__init__.py +++ b/examples/tutorial/sequence_parallel/data/__init__.py @@ -18,7 +18,7 @@ def cyclic_iter(iter): def build_train_valid_test_data_iterators( train_iters, global_batch_size, eval_interval, eval_iters, dataloader_type="single", **kwargs ): - (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) + train_dataloader, valid_dataloader, test_dataloader = (None, None, None) logger = get_dist_logger() logger.info("> building train, validation, and test datasets ...", ranks=[0]) diff --git a/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py b/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py index afab202e0927..929a5577835a 100644 --- a/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py +++ b/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py @@ -252,7 +252,7 @@ def build_training_sample( # Masking. max_predictions_per_seq = masked_lm_prob * max_num_tokens - (tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions( + tokens, masked_positions, masked_labels, _ = create_masked_lm_predictions( tokens, vocab_id_list, vocab_id_to_token_dict, diff --git a/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py b/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py index 8ba598529ebc..b65fd13b9e5a 100644 --- a/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py +++ b/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py @@ -14,7 +14,6 @@ # limitations under the License. """Dataloaders.""" - import torch from colossalai.legacy.context import ParallelMode diff --git a/extensions/csrc/kernel/arm/cpu_adam_arm.cpp b/extensions/csrc/kernel/arm/cpu_adam_arm.cpp index a715a2711576..5e295c69209a 100644 --- a/extensions/csrc/kernel/arm/cpu_adam_arm.cpp +++ b/extensions/csrc/kernel/arm/cpu_adam_arm.cpp @@ -1,7 +1,7 @@ #include "cpu_adam_arm.h" -void AdamOptimizer::Step_1(void *_params, void *grads, void *_exp_avg, - void *_exp_avg_sq, size_t _param_size, +void AdamOptimizer::Step_1(void* _params, void* grads, void* _exp_avg, + void* _exp_avg_sq, size_t _param_size, at::ScalarType param_dtype, at::ScalarType grad_dtype, at::ScalarType exp_avg_dtype, @@ -106,8 +106,8 @@ void AdamOptimizer::Step_1(void *_params, void *grads, void *_exp_avg, } } -void AdamOptimizer::Step_4(void *_params, void *grads, void *_exp_avg, - void *_exp_avg_sq, size_t _param_size, +void AdamOptimizer::Step_4(void* _params, void* grads, void* _exp_avg, + void* _exp_avg_sq, size_t _param_size, at::ScalarType param_dtype, at::ScalarType grad_dtype, at::ScalarType exp_avg_dtype, @@ -192,8 +192,8 @@ void AdamOptimizer::Step_4(void *_params, void *grads, void *_exp_avg, } } -void AdamOptimizer::Step_8(void *_params, void *grads, void *_exp_avg, - void *_exp_avg_sq, size_t _param_size, +void AdamOptimizer::Step_8(void* _params, void* grads, void* _exp_avg, + void* _exp_avg_sq, size_t _param_size, at::ScalarType param_dtype, at::ScalarType grad_dtype, at::ScalarType exp_avg_dtype, @@ -279,9 +279,9 @@ void AdamOptimizer::Step_8(void *_params, void *grads, void *_exp_avg, void AdamOptimizer::step(size_t step, float lr, float beta1, float beta2, float epsilon, float weight_decay, - bool bias_correction, torch::Tensor ¶ms, - torch::Tensor &grads, torch::Tensor &exp_avg, - torch::Tensor &exp_avg_sq, float loss_scale) { + bool bias_correction, torch::Tensor& params, + torch::Tensor& grads, torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq, float loss_scale) { auto params_c = params.contiguous(); auto grads_c = grads.contiguous(); auto exp_avg_c = exp_avg.contiguous(); diff --git a/extensions/csrc/kernel/arm/cpu_adam_arm.h b/extensions/csrc/kernel/arm/cpu_adam_arm.h index d48968e21682..70233be18a42 100644 --- a/extensions/csrc/kernel/arm/cpu_adam_arm.h +++ b/extensions/csrc/kernel/arm/cpu_adam_arm.h @@ -11,15 +11,15 @@ #include #define SIMD_WIDTH 4 -inline float32x4_t simd_load_offset(const void *ptr, at::ScalarType dtype, +inline float32x4_t simd_load_offset(const void* ptr, at::ScalarType dtype, size_t offset) { switch (dtype) { case at::ScalarType::Float: { - auto ptr_f = reinterpret_cast(ptr); + auto ptr_f = reinterpret_cast(ptr); return vld1q_f32(ptr_f + offset); } case at::ScalarType::Half: { - auto ptr_h = reinterpret_cast(ptr); + auto ptr_h = reinterpret_cast(ptr); return vcvt_f32_f16(vld1_f16(ptr_h + offset)); } // case at::ScalarType::BFloat16: { @@ -31,20 +31,20 @@ inline float32x4_t simd_load_offset(const void *ptr, at::ScalarType dtype, break; } } -inline float32x4_t simd_load(void const *ptr, at::ScalarType dtype) { +inline float32x4_t simd_load(void const* ptr, at::ScalarType dtype) { return simd_load_offset(ptr, dtype, 0); } -inline void simd_store_offset(void *ptr, at::ScalarType dtype, float32x4_t data, +inline void simd_store_offset(void* ptr, at::ScalarType dtype, float32x4_t data, size_t offset) { switch (dtype) { case at::ScalarType::Float: { - auto ptr_f = reinterpret_cast(ptr); + auto ptr_f = reinterpret_cast(ptr); vst1q_f32(ptr_f + offset, data); break; } case at::ScalarType::Half: { - auto ptr_h = reinterpret_cast(ptr); + auto ptr_h = reinterpret_cast(ptr); vst1_f16(ptr_h + offset, vcvt_f16_f32(data)); break; } @@ -59,7 +59,7 @@ inline void simd_store_offset(void *ptr, at::ScalarType dtype, float32x4_t data, } } -inline void simd_store(void *ptr, at::ScalarType dtype, float32x4_t data) { +inline void simd_store(void* ptr, at::ScalarType dtype, float32x4_t data) { return simd_store_offset(ptr, dtype, data, 0); } @@ -70,14 +70,14 @@ inline float32x4_t simd_set(float value) { #endif -inline float scalar_load_offset(const void *ptr, at::ScalarType dtype, +inline float scalar_load_offset(const void* ptr, at::ScalarType dtype, size_t offset) { switch (dtype) { case at::ScalarType::Float: - return *(reinterpret_cast(ptr) + offset); + return *(reinterpret_cast(ptr) + offset); case at::ScalarType::Half: return static_cast( - *(reinterpret_cast(ptr) + offset)); + *(reinterpret_cast(ptr) + offset)); // case at::ScalarType::BFloat16: // return static_cast( // *(reinterpret_cast(ptr) + offset)); @@ -87,14 +87,14 @@ inline float scalar_load_offset(const void *ptr, at::ScalarType dtype, } } -inline void scalar_store_offset(void *ptr, at::ScalarType dtype, float data, +inline void scalar_store_offset(void* ptr, at::ScalarType dtype, float data, size_t offset) { switch (dtype) { case at::ScalarType::Float: - *(reinterpret_cast(ptr) + offset) = data; + *(reinterpret_cast(ptr) + offset) = data; break; case at::ScalarType::Half: - *(reinterpret_cast(ptr) + offset) = data; + *(reinterpret_cast(ptr) + offset) = data; break; // case at::ScalarType::BFloat16: // *(reinterpret_cast(ptr) + offset) = data; @@ -105,13 +105,13 @@ inline void scalar_store_offset(void *ptr, at::ScalarType dtype, float data, } } -inline void *scalar_seek_offset(void *ptr, at::ScalarType dtype, +inline void* scalar_seek_offset(void* ptr, at::ScalarType dtype, size_t offset) { switch (dtype) { case at::ScalarType::Float: - return reinterpret_cast(ptr) + offset; + return reinterpret_cast(ptr) + offset; case at::ScalarType::Half: - return reinterpret_cast(ptr) + offset; + return reinterpret_cast(ptr) + offset; // case at::ScalarType::BFloat16: // return reinterpret_cast(ptr) + offset; default: @@ -120,8 +120,8 @@ inline void *scalar_seek_offset(void *ptr, at::ScalarType dtype, } } #define STEP(SPAN) \ - void Step_##SPAN(void *_params, void *grads, void *_exp_avg, \ - void *_exp_avg_sq, size_t _param_size, \ + void Step_##SPAN(void* _params, void* grads, void* _exp_avg, \ + void* _exp_avg_sq, size_t _param_size, \ at::ScalarType param_dtype, at::ScalarType grad_dtype, \ at::ScalarType exp_avg_dtype, \ at::ScalarType exp_avg_sq_dtype, float loss_scale = -1); @@ -195,7 +195,7 @@ class AdamOptimizer { } void step(size_t step, float lr, float beta1, float beta2, float epsilon, - float weight_decay, bool bias_correction, torch::Tensor ¶ms, - torch::Tensor &grads, torch::Tensor &exp_avg, - torch::Tensor &exp_avg_sq, float loss_scale); + float weight_decay, bool bias_correction, torch::Tensor& params, + torch::Tensor& grads, torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq, float loss_scale); }; diff --git a/extensions/csrc/kernel/cuda/utils/vec_copy.h b/extensions/csrc/kernel/cuda/utils/vec_copy.h index 465703a743a8..10423be6b359 100644 --- a/extensions/csrc/kernel/cuda/utils/vec_copy.h +++ b/extensions/csrc/kernel/cuda/utils/vec_copy.h @@ -9,36 +9,36 @@ namespace cuda { namespace utils { template -__device__ __inline__ void copy_zero(T *dst) { +__device__ __inline__ void copy_zero(T* dst) { using VT = typename common::VecTypeTrait::Type; - *(reinterpret_cast(dst)) = funcs::CastFunctor()(0.0f); + *(reinterpret_cast(dst)) = funcs::CastFunctor()(0.0f); } template -__device__ __inline__ void copy(const SrcT *src, DstT *dst) { +__device__ __inline__ void copy(const SrcT* src, DstT* dst) { using SrcVT = typename common::VecTypeTrait::Type; using DstVT = typename common::VecTypeTrait::Type; - *(reinterpret_cast(dst)) = funcs::CastFunctor()( - *(reinterpret_cast(src))); + *(reinterpret_cast(dst)) = funcs::CastFunctor()( + *(reinterpret_cast(src))); } template -__device__ __inline__ void copy(const T *src, T *dst) { +__device__ __inline__ void copy(const T* src, T* dst) { using VT = typename common::VecTypeTrait::Type; - *(reinterpret_cast(dst)) = *(reinterpret_cast(src)); + *(reinterpret_cast(dst)) = *(reinterpret_cast(src)); } template <> -__device__ __inline__ void copy(const float *src, float *dst) { +__device__ __inline__ void copy(const float* src, float* dst) { // Since the maximum memory alignment length is 128 bits, we choose float4 // here. - *(reinterpret_cast(dst)) = *(reinterpret_cast(src)); - *(reinterpret_cast(dst + 4)) = - *(reinterpret_cast(src + 4)); + *(reinterpret_cast(dst)) = *(reinterpret_cast(src)); + *(reinterpret_cast(dst + 4)) = + *(reinterpret_cast(src + 4)); } template -int get_vec_size(const torch::Tensor &tensor) { +int get_vec_size(const torch::Tensor& tensor) { uint64_t address = reinterpret_cast(tensor.data_ptr()); const int max_aligned_size = 128; const int dtype_size = sizeof(T) * 8; diff --git a/extensions/csrc/kernel/x86/cpu_adam.cpp b/extensions/csrc/kernel/x86/cpu_adam.cpp index be9300c545c2..ebb178533d72 100644 --- a/extensions/csrc/kernel/x86/cpu_adam.cpp +++ b/extensions/csrc/kernel/x86/cpu_adam.cpp @@ -32,8 +32,8 @@ SOFTWARE // C++ interface -void Adam_Optimizer::Step_1(float *_params, float *grads, float *_exp_avg, - float *_exp_avg_sq, size_t _param_size, +void Adam_Optimizer::Step_1(float* _params, float* grads, float* _exp_avg, + float* _exp_avg_sq, size_t _param_size, bool param_half_precision, bool grad_half_precision, bool momentum_half_precision, bool variance_half_precision, float loss_scale) { @@ -44,10 +44,10 @@ void Adam_Optimizer::Step_1(float *_params, float *grads, float *_exp_avg, float step_size = -1 * _alpha / _bias_correction1; float w_decay = -1 * _alpha * _weight_decay; - __half *params_cast_h = reinterpret_cast<__half *>(_params); - __half *grads_cast_h = reinterpret_cast<__half *>(grads); - __half *momentum_cast_h = reinterpret_cast<__half *>(_exp_avg); - __half *variance_cast_h = reinterpret_cast<__half *>(_exp_avg_sq); + __half* params_cast_h = reinterpret_cast<__half*>(_params); + __half* grads_cast_h = reinterpret_cast<__half*>(grads); + __half* momentum_cast_h = reinterpret_cast<__half*>(_exp_avg); + __half* variance_cast_h = reinterpret_cast<__half*>(_exp_avg_sq); #if defined(__AVX512__) or defined(__AVX256__) or defined(__AVX2__) AVX_Data betta1_4; @@ -182,17 +182,17 @@ void Adam_Optimizer::Step_1(float *_params, float *grads, float *_exp_avg, } } -void Adam_Optimizer::Step_4(float *_params, float *grads, float *_exp_avg, - float *_exp_avg_sq, size_t _param_size, +void Adam_Optimizer::Step_4(float* _params, float* grads, float* _exp_avg, + float* _exp_avg_sq, size_t _param_size, bool param_half_precision, bool grad_half_precision, bool momentum_half_precision, bool variance_half_precision, float loss_scale) { size_t rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * 4); - __half *params_cast_h = reinterpret_cast<__half *>(_params); - __half *grads_cast_h = reinterpret_cast<__half *>(grads); - __half *momentum_cast_h = reinterpret_cast<__half *>(_exp_avg); - __half *variance_cast_h = reinterpret_cast<__half *>(_exp_avg_sq); + __half* params_cast_h = reinterpret_cast<__half*>(_params); + __half* grads_cast_h = reinterpret_cast<__half*>(grads); + __half* momentum_cast_h = reinterpret_cast<__half*>(_exp_avg); + __half* variance_cast_h = reinterpret_cast<__half*>(_exp_avg_sq); #if defined(__AVX512__) or defined(__AVX256__) or defined(__AVX2__) AVX_Data betta1_4; @@ -285,29 +285,29 @@ void Adam_Optimizer::Step_4(float *_params, float *grads, float *_exp_avg, } #endif if (_param_size > rounded_size) - Step_1((param_half_precision ? (float *)(params_cast_h + rounded_size) + Step_1((param_half_precision ? (float*)(params_cast_h + rounded_size) : _params + rounded_size), - (grad_half_precision ? (float *)(grads_cast_h + rounded_size) + (grad_half_precision ? (float*)(grads_cast_h + rounded_size) : grads + rounded_size), - (momentum_half_precision ? (float *)(momentum_cast_h + rounded_size) + (momentum_half_precision ? (float*)(momentum_cast_h + rounded_size) : _exp_avg + rounded_size), - (variance_half_precision ? (float *)(variance_cast_h + rounded_size) + (variance_half_precision ? (float*)(variance_cast_h + rounded_size) : _exp_avg_sq + rounded_size), (_param_size - rounded_size), param_half_precision, grad_half_precision, momentum_half_precision, variance_half_precision, loss_scale); } -void Adam_Optimizer::Step_8(float *_params, float *grads, float *_exp_avg, - float *_exp_avg_sq, size_t _param_size, +void Adam_Optimizer::Step_8(float* _params, float* grads, float* _exp_avg, + float* _exp_avg_sq, size_t _param_size, bool param_half_precision, bool grad_half_precision, bool momentum_half_precision, bool variance_half_precision, float loss_scale) { size_t rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * 8); - __half *params_cast_h = reinterpret_cast<__half *>(_params); - __half *grads_cast_h = reinterpret_cast<__half *>(grads); - __half *momentum_cast_h = reinterpret_cast<__half *>(_exp_avg); - __half *variance_cast_h = reinterpret_cast<__half *>(_exp_avg_sq); + __half* params_cast_h = reinterpret_cast<__half*>(_params); + __half* grads_cast_h = reinterpret_cast<__half*>(grads); + __half* momentum_cast_h = reinterpret_cast<__half*>(_exp_avg); + __half* variance_cast_h = reinterpret_cast<__half*>(_exp_avg_sq); #if defined(__AVX512__) or defined(__AVX256__) or defined(__AVX2__) AVX_Data betta1_4; @@ -400,13 +400,13 @@ void Adam_Optimizer::Step_8(float *_params, float *grads, float *_exp_avg, } #endif if (_param_size > rounded_size) - Step_4((param_half_precision ? (float *)(params_cast_h + rounded_size) + Step_4((param_half_precision ? (float*)(params_cast_h + rounded_size) : _params + rounded_size), - (grad_half_precision ? (float *)(grads_cast_h + rounded_size) + (grad_half_precision ? (float*)(grads_cast_h + rounded_size) : grads + rounded_size), - (momentum_half_precision ? (float *)(momentum_cast_h + rounded_size) + (momentum_half_precision ? (float*)(momentum_cast_h + rounded_size) : _exp_avg + rounded_size), - (variance_half_precision ? (float *)(variance_cast_h + rounded_size) + (variance_half_precision ? (float*)(variance_cast_h + rounded_size) : _exp_avg_sq + rounded_size), (_param_size - rounded_size), param_half_precision, grad_half_precision, momentum_half_precision, @@ -415,18 +415,18 @@ void Adam_Optimizer::Step_8(float *_params, float *grads, float *_exp_avg, void Adam_Optimizer::step(size_t step, float lr, float beta1, float beta2, float epsilon, float weight_decay, - bool bias_correction, torch::Tensor ¶ms, - torch::Tensor &grads, torch::Tensor &exp_avg, - torch::Tensor &exp_avg_sq, float loss_scale) { + bool bias_correction, torch::Tensor& params, + torch::Tensor& grads, torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq, float loss_scale) { auto params_c = params.contiguous(); auto grads_c = grads.contiguous(); auto exp_avg_c = exp_avg.contiguous(); auto exp_avg_sq_c = exp_avg_sq.contiguous(); - float *params_ptr = (float *)params_c.data_ptr(); - float *grads_ptr = (float *)grads_c.data_ptr(); - float *exp_avg_ptr = (float *)exp_avg_c.data_ptr(); - float *exp_avg_sq_ptr = (float *)exp_avg_sq_c.data_ptr(); + float* params_ptr = (float*)params_c.data_ptr(); + float* grads_ptr = (float*)grads_c.data_ptr(); + float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); + float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); this->IncrementStep(step, beta1, beta2); this->update_state(lr, epsilon, weight_decay, bias_correction); diff --git a/extensions/csrc/kernel/x86/cpu_adam.h b/extensions/csrc/kernel/x86/cpu_adam.h index 45e1dde6242d..2e32066ee4f7 100644 --- a/extensions/csrc/kernel/x86/cpu_adam.h +++ b/extensions/csrc/kernel/x86/cpu_adam.h @@ -49,10 +49,10 @@ SOFTWARE #define SIMD_SQRT(x) _mm512_sqrt_ps(x) #define SIMD_DIV(x, y) _mm512_div_ps(x, y) #define SIMD_LOAD_HALF(x) \ - _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x))) -#define SIMD_STORE_HALF(x, d) \ - _mm256_storeu_ps((float *)(x), _mm256_castsi256_ps(_mm512_cvtps_ph( \ - d, _MM_FROUND_TO_NEAREST_INT))) + _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)(x))) +#define SIMD_STORE_HALF(x, d) \ + _mm256_storeu_ps((float*)(x), _mm256_castsi256_ps(_mm512_cvtps_ph( \ + d, _MM_FROUND_TO_NEAREST_INT))) #elif defined(__AVX256__) or defined(__AVX2__) #define SIMD_WIDTH 8 @@ -65,10 +65,10 @@ SOFTWARE #define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) #define SIMD_SQRT(x) _mm256_sqrt_ps(x) #define SIMD_DIV(x, y) _mm256_div_ps(x, y) -#define SIMD_LOAD_HALF(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x))) -#define SIMD_STORE_HALF(x, d) \ - _mm_storeu_ps((float *)(x), _mm_castsi128_ps(_mm256_cvtps_ph( \ - d, _MM_FROUND_TO_NEAREST_INT))) +#define SIMD_LOAD_HALF(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)(x))) +#define SIMD_STORE_HALF(x, d) \ + _mm_storeu_ps((float*)(x), _mm_castsi128_ps(_mm256_cvtps_ph( \ + d, _MM_FROUND_TO_NEAREST_INT))) #endif @@ -85,7 +85,7 @@ union AVX_Data { #define STEP(SPAN) \ void Step_##SPAN( \ - float *_params, float *grads, float *_exp_avg, float *_exp_avg_sq, \ + float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq, \ size_t _param_size, bool param_half_precision = false, \ bool grad_half_precision = false, bool momentum_half_precision = false, \ bool variance_half_precision = false, float loss_scale = -1); @@ -143,8 +143,8 @@ class Adam_Optimizer { } #if defined(__AVX512__) or defined(__AVX256__) or defined(__AVX2__) - inline void simd_load(bool is_half, float *ptr, __half *h_ptr, - AVX_Data &data) { + inline void simd_load(bool is_half, float* ptr, __half* h_ptr, + AVX_Data& data) { if (is_half) { data.data = SIMD_LOAD_HALF(h_ptr); } else { @@ -152,8 +152,8 @@ class Adam_Optimizer { } } - inline void simd_store(bool is_half, float *ptr, __half *h_ptr, - AVX_Data &data) { + inline void simd_store(bool is_half, float* ptr, __half* h_ptr, + AVX_Data& data) { if (is_half) { SIMD_STORE_HALF(h_ptr, data.data); } else { @@ -163,9 +163,9 @@ class Adam_Optimizer { #endif void step(size_t step, float lr, float beta1, float beta2, float epsilon, - float weight_decay, bool bias_correction, torch::Tensor ¶ms, - torch::Tensor &grads, torch::Tensor &exp_avg, - torch::Tensor &exp_avg_sq, float loss_scale); + float weight_decay, bool bias_correction, torch::Tensor& params, + torch::Tensor& grads, torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq, float loss_scale); private: float _alpha; diff --git a/extensions/pybind/layernorm/layer_norm.cpp b/extensions/pybind/layernorm/layer_norm.cpp index 77c4e38c8150..550f95d158e0 100644 --- a/extensions/pybind/layernorm/layer_norm.cpp +++ b/extensions/pybind/layernorm/layer_norm.cpp @@ -11,8 +11,8 @@ namespace { -void compute_n1_n2(at::Tensor input, at::IntArrayRef normalized_shape, int &n1, - int &n2) { +void compute_n1_n2(at::Tensor input, at::IntArrayRef normalized_shape, int& n1, + int& n2) { int idiff = input.ndimension() - normalized_shape.size(); n2 = 1; for (int i = 0; i < (int)normalized_shape.size(); ++i) { @@ -31,8 +31,8 @@ void check_args(at::IntArrayRef normalized_shape, at::Tensor gamma, TORCH_CHECK(!beta.defined() || beta.sizes().equals(normalized_shape)); } -void check_args(at::Tensor input, at::IntArrayRef normalized_shape, int &n1, - int &n2) { +void check_args(at::Tensor input, at::IntArrayRef normalized_shape, int& n1, + int& n2) { int64_t normalized_ndim = normalized_shape.size(); if (normalized_ndim < 1) { @@ -63,16 +63,16 @@ void check_args(at::Tensor input, at::IntArrayRef normalized_shape, int &n1, } void check_args(at::Tensor input, at::IntArrayRef normalized_shape, - at::Tensor gamma, at::Tensor beta, int &n1, int &n2) { + at::Tensor gamma, at::Tensor beta, int& n1, int& n2) { check_args(input, normalized_shape, n1, n2); check_args(normalized_shape, gamma, beta); } } // namespace -void cuda_layer_norm(at::Tensor *output, at::Tensor *mean, at::Tensor *invvar, - at::Tensor *input, int n1, int n2, - at::IntArrayRef normalized_shape, at::Tensor *gamma, - at::Tensor *beta, double epsilon); +void cuda_layer_norm(at::Tensor* output, at::Tensor* mean, at::Tensor* invvar, + at::Tensor* input, int n1, int n2, + at::IntArrayRef normalized_shape, at::Tensor* gamma, + at::Tensor* beta, double epsilon); #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) \ @@ -103,12 +103,12 @@ std::vector layer_norm_affine(at::Tensor input, return {output, mean, invvar}; } -void cuda_layer_norm_gradient(at::Tensor *dout, at::Tensor *mean, - at::Tensor *invvar, at::Tensor *input, int n1, +void cuda_layer_norm_gradient(at::Tensor* dout, at::Tensor* mean, + at::Tensor* invvar, at::Tensor* input, int n1, int n2, at::IntArrayRef normalized_shape, - at::Tensor *gamma, at::Tensor *beta, - double epsilon, at::Tensor *grad_input, - at::Tensor *grad_gamma, at::Tensor *grad_beta); + at::Tensor* gamma, at::Tensor* beta, + double epsilon, at::Tensor* grad_input, + at::Tensor* grad_gamma, at::Tensor* grad_beta); std::vector layer_norm_gradient_affine( at::Tensor dout, at::Tensor mean, at::Tensor invvar, at::Tensor input, diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py index 53dd3c8dd3ba..f5750cf7be0c 100644 --- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py @@ -41,7 +41,7 @@ def exam_state_dict_with_origin( ): from transformers import BertForSequenceClassification - (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values())) + model_fn, data_gen_fn, output_transform_fn, _, _ = next(iter(model_zoo.get_sub_registry(model_name).values())) bert_model = model_fn() enable_flash_attention = True if tp_size > 1 else False @@ -101,7 +101,7 @@ def exam_state_dict( use_async: bool, low_cpu_mem_mode: bool, ): - (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values())) + model_fn, data_gen_fn, output_transform_fn, _, _ = next(iter(model_zoo.get_sub_registry(model_name).values())) criterion = lambda x: x.mean() enable_flash_attention = True if tp_size > 1 else False enable_fused_normalization = True if tp_size > 1 else False diff --git a/tests/test_checkpoint_io/test_gemini_torch_compability.py b/tests/test_checkpoint_io/test_gemini_torch_compability.py index ce4d10322ba5..4cc5e258c85b 100644 --- a/tests/test_checkpoint_io/test_gemini_torch_compability.py +++ b/tests/test_checkpoint_io/test_gemini_torch_compability.py @@ -22,7 +22,7 @@ @parameterize("shard", [False, True]) @parameterize("model_name", ["transformers_llama_for_causal_lm"]) def exam_torch_load_from_gemini(shard: bool, model_name: str): - (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values())) + model_fn, data_gen_fn, output_transform_fn, _, _ = next(iter(model_zoo.get_sub_registry(model_name).values())) criterion = lambda x: x.mean() plugin = GeminiPlugin(precision="fp16", initial_scale=(2**14)) booster = Booster(plugin=plugin) @@ -88,7 +88,7 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str): @parameterize("shard", [False, True]) @parameterize("model_name", ["transformers_gpt"]) def exam_gemini_load_from_torch(shard: bool, model_name: str): - (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values())) + model_fn, data_gen_fn, output_transform_fn, _, _ = next(iter(model_zoo.get_sub_registry(model_name).values())) criterion = lambda x: x.mean() plugin = TorchDDPPlugin() booster = Booster(plugin=plugin) diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py index a338d98f4746..e78e0f6457a9 100644 --- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py @@ -48,9 +48,7 @@ def exam_state_dict( shard: bool, model_name: str, size_per_shard: int, test_config: dict, use_async: bool, low_cpu_mem_mode: bool ): - (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next( - iter(model_zoo.get_sub_registry(model_name).values()) - ) + model_fn, data_gen_fn, output_transform_fn, loss_fn, _ = next(iter(model_zoo.get_sub_registry(model_name).values())) criterion = loss_fn plugin = HybridParallelPlugin(**test_config) booster = Booster(plugin=plugin) diff --git a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py index 6f8eb2ad26cd..4fdd2a42d97e 100644 --- a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py +++ b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py @@ -21,9 +21,7 @@ @parameterize("model_name", ["transformers_llama_for_causal_lm"]) @parameterize("plugin_type", ["ddp", "zero", "gemini"]) def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32): - (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next( - iter(model_zoo.get_sub_registry(model_name).values()) - ) + model_fn, data_gen_fn, output_transform_fn, loss_fn, _ = next(iter(model_zoo.get_sub_registry(model_name).values())) criterion = loss_fn if plugin_type == "ddp": diff --git a/tests/test_tensor/test_mix_gather.py b/tests/test_tensor/test_mix_gather.py index 6dbbe5de6ff1..100405bef6b4 100644 --- a/tests/test_tensor/test_mix_gather.py +++ b/tests/test_tensor/test_mix_gather.py @@ -13,7 +13,7 @@ def check_mix_gather_S0S1(device_mesh, rank): tensor_to_check = torch.arange(64).reshape((8, 8)).cuda() - (f, b) = (0, 1) + f, b = (0, 1) f_target_pair = (f, [0]) b_target_pair = (b, [1]) gather_dim, logical_process_axes = mix_gather_simulator(f_target_pair, b_target_pair) @@ -89,7 +89,7 @@ def check_two_all_gather_S0S1(device_mesh, rank): def check_mix_gather_S1S0(device_mesh, rank): tensor_to_check = torch.arange(64).reshape((8, 8)).cuda() - (f, b) = (0, 1) + f, b = (0, 1) f_target_pair = (f, [1]) b_target_pair = (b, [0]) gather_dim, logical_process_axes = mix_gather_simulator(f_target_pair, b_target_pair) @@ -165,7 +165,7 @@ def check_two_all_gather_S1S0(device_mesh, rank): def check_mix_gather_S01R(device_mesh, rank): tensor_to_check = torch.arange(64).reshape((8, 8)).cuda() - (f, b) = (0, 1) + f, b = (0, 1) f_target_pair = (f, [0, 1]) b_target_pair = (b, []) gather_dim, logical_process_axes = mix_gather_simulator(f_target_pair, b_target_pair) @@ -231,7 +231,7 @@ def check_two_all_gather_S01R(device_mesh, rank): def check_mix_gather_RS01(device_mesh, rank): tensor_to_check = torch.arange(64).reshape((8, 8)).cuda() - (f, b) = (0, 1) + f, b = (0, 1) f_target_pair = (f, []) b_target_pair = (b, [0, 1]) gather_dim, logical_process_axes = mix_gather_simulator(f_target_pair, b_target_pair)