Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
repos:

- repo: https://github.com/PyCQA/autoflake
rev: v2.3.1
rev: v2.3.3
hooks:
- id: autoflake
name: autoflake (python)
args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports']

- repo: https://github.com/pycqa/isort
rev: 5.13.2
rev: 8.0.1
hooks:
- id: isort
name: sort all imports (python)
args: ["--profile", "black"] # avoid conflict with black

- repo: https://github.com/psf/black-pre-commit-mirror
rev: 24.10.0
rev: 26.3.1
hooks:
- id: black
name: black formatter
args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']

- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.5
rev: v22.1.2
hooks:
- id: clang-format
name: clang formatter
types_or: [c++, c]

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v6.0.0
hooks:
- id: check-yaml
- id: check-merge-conflict
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""
Initialize new model with updated tokenizer by calculating the mean values from original model
"""

import argparse

import numpy as np
Expand Down
28 changes: 10 additions & 18 deletions applications/ColossalChat/coati/dataset/tokenization_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,8 @@ def tokenize_sft(
template.messages = []
for idx, mess in enumerate(messages):
if mess["from"] != template.roles[idx % 2]:
raise ValueError(
f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{messages}"
)
raise ValueError(f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{messages}")
template.append_message(mess["from"], mess["content"])

if len(template.messages) % 2 != 0:
Expand Down Expand Up @@ -245,10 +243,8 @@ def tokenize_rlhf(

for idx, mess in enumerate(context):
if mess["from"] != template.roles[idx % 2]:
raise ValueError(
f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{context}"
)
raise ValueError(f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{context}")
template.append_message(mess["from"], mess["content"])

if len(template.messages) % 2 != 1:
Expand All @@ -272,18 +268,14 @@ def tokenize_rlhf(
rejected_continuation = data_point["rejected"]
for round in range(len(chosen_continuation)):
if chosen_continuation[round]["from"] != template.roles[(round + 1) % 2]:
raise ValueError(
f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{chosen_continuation}"
)
raise ValueError(f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{chosen_continuation}")
chosen.append_message(chosen_continuation[round]["from"], chosen_continuation[round]["content"])

for round in range(len(rejected_continuation)):
if rejected_continuation[round]["from"] != template.roles[(round + 1) % 2]:
raise ValueError(
f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{rejected_continuation}"
)
raise ValueError(f"Message should iterate between user and assistant and starts with a \
line from the user. Got the following data:\n{rejected_continuation}")
rejected.append_message(rejected_continuation[round]["from"], rejected_continuation[round]["content"])

(
Expand All @@ -296,14 +288,14 @@ def tokenize_rlhf(
) = (None, None, None, None, None, None)

chosen_data_packed = apply_rlhf_data_format(chosen, tokenizer)
(chosen_input_ids, chosen_loss_mask, chosen_label_decode) = (
chosen_input_ids, chosen_loss_mask, chosen_label_decode = (
chosen_data_packed["input_ids"],
chosen_data_packed["loss_mask"],
chosen_data_packed["label_decode"],
)

rejected_data_packed = apply_rlhf_data_format(rejected, tokenizer)
(rejected_input_ids, rejected_loss_mask, rejected_label_decode) = (
rejected_input_ids, rejected_loss_mask, rejected_label_decode = (
rejected_data_packed["input_ids"],
rejected_data_packed["loss_mask"],
rejected_data_packed["label_decode"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
https://github.com/volcengine/verl
"""


import json

import torch
Expand Down
4 changes: 2 additions & 2 deletions applications/ColossalChat/coati/trainer/kto.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def _train(self, epoch: int):
)
for i, batch in enumerate(self.train_dataloader):
batch = to_device(batch, self.device)
(input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask) = (
input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask = (
batch["input_ids"],
batch["attention_mask"],
batch["loss_mask"],
Expand Down Expand Up @@ -279,7 +279,7 @@ def _eval(self, epoch: int):
)
for i, batch in enumerate(self.train_dataloader):
batch = to_device(batch, self.device)
(input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask) = (
input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask = (
batch["input_ids"],
batch["attention_mask"],
batch["loss_mask"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def _init_optimizer(self):
def _prepare_model_with_strategy(self, has_optimizer: bool):
if has_optimizer:
self._init_optimizer()
(self._model, self._optimizer) = self._strategy.prepare((self._model, self._optimizer))
self._model, self._optimizer = self._strategy.prepare((self._model, self._optimizer))
else:
self._model = self._strategy.prepare(self._model)

Expand Down
4 changes: 2 additions & 2 deletions applications/ColossalQA/examples/webui_demo/webui.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,11 @@ def restart(chatbot, txt):
)
with gr.Row():
btn = gr.UploadButton("📁", file_types=["file"], file_count="multiple", size="sm")
restart_btn = gr.Button(str("\u21BB"), elem_id="restart-btn", scale=1)
restart_btn = gr.Button(str("\u21bb"), elem_id="restart-btn", scale=1)
txt = gr.Textbox(
scale=8,
show_label=False,
placeholder="Enter text and press enter, or use 📁 to upload files, click \u21BB to clear loaded files and restart chat",
placeholder="Enter text and press enter, or use 📁 to upload files, click \u21bb to clear loaded files and restart chat",
container=True,
autofocus=True,
)
Expand Down
4 changes: 2 additions & 2 deletions colossalai/auto_parallel/tensor_shard/solver/solver.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""This code is adapted from Alpa
https://github.com/alpa-projects/alpa/
with some changes. """
https://github.com/alpa-projects/alpa/
with some changes."""

import multiprocessing
import time
Expand Down
2 changes: 1 addition & 1 deletion colossalai/autochunk/select_chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def _select_min_memory_chunk_region(self, possible_chunk_regions, chunk_infos):
return best_region

def _is_legal_region(self, cur_chunk_info, chunk_infos):
(chunk_region_start, chunk_region_end) = cur_chunk_info["region"]
chunk_region_start, chunk_region_end = cur_chunk_info["region"]
if cur_chunk_info in chunk_infos:
return False
if chunk_region_end < chunk_region_start:
Expand Down
6 changes: 2 additions & 4 deletions colossalai/booster/plugin/gemini_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,10 +338,8 @@ def load_sharded_optimizer(
# Load param_groups.
param_group_path = ckpt_index_file.get_param_group_filename()
if param_group_path is None:
raise RuntimeError(
f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory."
)
raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory.")
saved_param_groups = torch.load(param_group_path)
optimizer.load_param_groups(saved_param_groups)

Expand Down
6 changes: 2 additions & 4 deletions colossalai/booster/plugin/low_level_zero_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,10 +268,8 @@ def load_sharded_optimizer(
# Load param_groups
param_group_path = ckpt_index_file.get_param_group_filename()
if param_group_path is None:
raise RuntimeError(
f"Invalid index file path {index_file_path} for an optimizer. \
Lacking param group file under current directory."
)
raise RuntimeError(f"Invalid index file path {index_file_path} for an optimizer. \
Lacking param group file under current directory.")
id_map = load_param_groups_into_optimizer(optimizer, param_group_path)

checkpoint_files, _ = ckpt_index_file.get_checkpoint_filenames()
Expand Down
6 changes: 2 additions & 4 deletions colossalai/checkpoint_io/general_checkpoint_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,8 @@ def load_sharded_optimizer(
# Load param_groups
param_group_path = ckpt_index_file.get_param_group_filename()
if param_group_path is None:
raise RuntimeError(
f"Invalid index file path {index_file_path} for an optimizer. \
Lacking param group file under current directory."
)
raise RuntimeError(f"Invalid index file path {index_file_path} for an optimizer. \
Lacking param group file under current directory.")
id_map = load_param_groups_into_optimizer(optimizer, param_group_path)

checkpoint_files, _ = ckpt_index_file.get_checkpoint_filenames()
Expand Down
6 changes: 2 additions & 4 deletions colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,10 +690,8 @@ def _get_param_id_from_optimizer_param(
# Load param_groups
param_group_path = ckpt_index_file.get_param_group_filename()
if param_group_path is None:
raise RuntimeError(
f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory."
)
raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory.")
saved_groups = torch.load(param_group_path)

updated_groups = []
Expand Down
6 changes: 2 additions & 4 deletions colossalai/checkpoint_io/moe_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,10 +559,8 @@ def _get_param_id_from_optimizer_param(
# Load param_groups
param_group_path = ckpt_index_file.get_param_group_filename()
if param_group_path is None:
raise RuntimeError(
f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory."
)
raise RuntimeError(f"Invalid index file path {checkpoint_index_file} for an optimizer. \
Lacking param group file under current directory.")
saved_groups = torch.load(param_group_path)

updated_groups = []
Expand Down
4 changes: 2 additions & 2 deletions colossalai/device/alpha_beta_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def profile_latency(self, process_group, pg_handler):
latency_list = []
for i in range(self.latency_iters):
nbytes = int(BYTE << i)
(t, _) = self._profile(process_group, pg_handler, nbytes)
t, _ = self._profile(process_group, pg_handler, nbytes)
latency_list.append(t)

if latency_list[0] is None:
Expand All @@ -157,7 +157,7 @@ def profile_bandwidth(self, process_group, pg_handler, maxbytes=(1 * GB)):
process_group: A tuple of global rank of the process group.
pg_handler: The handler of the process group.
"""
(_, bandwidth) = self._profile(process_group, pg_handler, maxbytes)
_, bandwidth = self._profile(process_group, pg_handler, maxbytes)
return bandwidth

def profile_ab(self):
Expand Down
4 changes: 2 additions & 2 deletions colossalai/device/device_mesh.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""This code is adapted from Alpa
https://github.com/alpa-projects/alpa/
with some changes. """
https://github.com/alpa-projects/alpa/
with some changes."""

import operator
from dataclasses import dataclass
Expand Down
1 change: 1 addition & 0 deletions colossalai/fx/tracer/tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
Implemented a tracer which supports control flow and user-defined meta arguments.
The implementation is partly inspired HuggingFace's fx tracer
"""

import enum
import functools
import inspect
Expand Down
4 changes: 1 addition & 3 deletions colossalai/inference/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,7 @@ def recycle(self) -> None:
"""
Recycle a running sequence to the waiting list
"""
assert (
not self.check_finish() and not self.status == RequestStatus.ABORTED
), "The running sequence \
assert not self.check_finish() and not self.status == RequestStatus.ABORTED, "The running sequence \
is already done but it still in running list"
self.status = RequestStatus.RECYCLED

Expand Down
2 changes: 1 addition & 1 deletion colossalai/kernel/triton/llama_act_combine_kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def forward(ctx: Any, x_gate: torch.Tensor, x_up: torch.Tensor, activation: str
@custom_bwd
def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, Tensor, None, None]:
# restore from ctx
(x_gate1, x_gate2, x_up) = ctx.saved_tensors
x_gate1, x_gate2, x_up = ctx.saved_tensors
M, N, BLOCK_SIZE, num_warps = ctx.M, ctx.N, ctx.BLOCK_SIZE, ctx.num_warps

# init grad
Expand Down
6 changes: 2 additions & 4 deletions colossalai/legacy/engine/schedule/_base_schedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,6 @@ def _call_engine_criterion(engine, outputs, labels):
elif isinstance(outputs, dict) and isinstance(labels, (list, tuple)):
raise ValueError(f"Expected labels to be a dict when the model outputs are dict, but got {type(labels)}")
else:
raise TypeError(
f"Expected model outputs and labels to be of type torch.Tensor ' \
raise TypeError(f"Expected model outputs and labels to be of type torch.Tensor ' \
'(which is auto-converted to tuple), list, tuple, or dict, ' \
'but got {type(outputs)} (model outputs) and {type(labels)} (labels)"
)
'but got {type(outputs)} (model outputs) and {type(labels)} (labels)")
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Adapted from https://github.com/ModelTC/lightllm

"""Sampling parameters for text generation."""

from typing import List, Optional, Union

_SAMPLING_EPS = 1e-5
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ def bloom_attention_forward(
fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size]

# 3 x [batch_size, seq_length, num_heads, head_dim]
(query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
query_layer, key_layer, value_layer = self._split_heads(fused_qkv)
batch_size, q_length, H, D_HEAD = query_layer.shape
k = key_layer.reshape(-1, H, D_HEAD) # batch_size * q_length, H, D_HEAD, q_length == 1
v = value_layer.reshape(-1, H, D_HEAD) # batch_size * q_length, H, D_HEAD, q_length == 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ def chatglm_flash_attn_kvcache_forward(
# Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
mixed_x_layer = self.query_key_value(hidden_states)
if self.multi_query_attention:
(query_layer, key_layer, value_layer) = mixed_x_layer.split(
query_layer, key_layer, value_layer = mixed_x_layer.split(
[
self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
Expand Down Expand Up @@ -436,7 +436,7 @@ def chatglm_flash_attn_kvcache_forward(
)
mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
# [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
(query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
query_layer, key_layer, value_layer = split_tensor_along_last_dim(mixed_x_layer, 3)
cos, sin = infer_state.position_cos, infer_state.position_sin

chatglm2_rotary_emb_fwd(
Expand Down
3 changes: 2 additions & 1 deletion colossalai/legacy/moe/openmoe/model/modeling_openmoe.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch OpenMoE model."""
"""PyTorch OpenMoE model."""

import math
from typing import List, Optional, Tuple, Union

Expand Down
2 changes: 1 addition & 1 deletion colossalai/legacy/nn/layer/parallel_sequence/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def forward(self, hidden_states, attention_mask):
"the last dimension is not a multiple of 3, " "cannot be divided into query, key and value"
)
partition_size = last_dim_value // 3
(query_layer, key_layer, value_layer) = torch.split(mixed_x_layer, partition_size, dim=last_dim)
query_layer, key_layer, value_layer = torch.split(mixed_x_layer, partition_size, dim=last_dim)

# attention scores: [batch_size, num_heads, sub_seq_len, seq_len]
output_size = (
Expand Down
4 changes: 2 additions & 2 deletions colossalai/nn/layer/layernorm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""This code is from NVIDIA apex:
https://github.com/NVIDIA/apex
with some changes. """
https://github.com/NVIDIA/apex
with some changes."""

import numbers

Expand Down
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/distributed_galore.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""
"""adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""

import warnings
from collections import defaultdict
Expand Down
1 change: 1 addition & 0 deletions colossalai/nn/optimizer/fused_adam.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
Licensed under the MIT License.
"""

import torch

from colossalai.utils import get_current_device, multi_tensor_applier
Expand Down
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/galore.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""
"""adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""

import warnings
from typing import List
Expand Down
Loading
Loading