diff --git a/angelslim/models/llm/__init__.py b/angelslim/models/llm/__init__.py index f54581b6..cb3b45ff 100644 --- a/angelslim/models/llm/__init__.py +++ b/angelslim/models/llm/__init__.py @@ -18,3 +18,4 @@ from .kimi_k2 import KimiK2 # noqa: F401 from .llama import Llama # noqa: F401 from .qwen import Qwen # noqa: F401 +from .seed_oss import SeedOss # noqa: F401 diff --git a/angelslim/models/llm/seed_oss.py b/angelslim/models/llm/seed_oss.py new file mode 100644 index 00000000..7e1ca3df --- /dev/null +++ b/angelslim/models/llm/seed_oss.py @@ -0,0 +1,95 @@ +# Copyright 2025 Tencent Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import re

import torch.nn as nn

from ...compressor.quant.core import PTQSaveVllmHF
from ..base_model import BaseLLMModel
from ..model_factory import SlimModelFactory


@SlimModelFactory.register
class SeedOss(BaseLLMModel):
    """Seed-OSS model wrapper registered with ``SlimModelFactory``.

    Selects the linear layers to observe, supplies the default smooth
    mappings, and picks the save function for the deploy backend.
    """

    def __init__(
        self,
        model=None,
        deploy_backend="vllm",
    ):
        """Initialise the wrapper.

        Args:
            model: Forwarded unchanged to ``BaseLLMModel.__init__``.
            deploy_backend: Backend name; ``get_save_func`` accepts
                ``"vllm"`` and ``"huggingface"``.
        """
        super().__init__(
            model=model,
            deploy_backend=deploy_backend,
        )
        # Name prefix a layer must have to be observed (see get_observer_layers).
        self.block_name = "model.layers"

    def get_observer_layers(self):
        """Return a ``{layer_name: module}`` dict of the layers to observe.

        Every ``nn.Linear`` found by ``self.find_layers`` is kept when its
        name starts with ``self.block_name`` and its last dotted component
        is one of the attention/MLP projection names; every other linear
        layer name is appended to the ignore list, which is then stored in
        ``self.quant_config.quant_algo_info["ignore_layers"]``.

        When ``self.quant_config.custom_observe_layers_names`` is not
        ``"default"``, layers whose name does not contain a custom name are
        removed from the result.
        """
        names = [
            "k_proj",
            "v_proj",
            "q_proj",
            "o_proj",
            "up_proj",
            "gate_proj",
            "down_proj",
        ]
        obs_layers = [nn.Linear]
        observer_layers_dict = {}
        layers_dict = self.find_layers(self.model, layers=obs_layers)

        ignore_layers = self.skip_layer_names()
        for name, module in layers_dict.items():
            if name.startswith(self.block_name) and name.split(".")[-1] in names:
                observer_layers_dict[name] = module
            else:
                ignore_layers.append(name)
        self.quant_config.quant_algo_info["ignore_layers"] = ignore_layers

        custom_names = self.quant_config.custom_observe_layers_names
        if custom_names != "default":
            for custom_observe_name in custom_names:
                # Iterate over a snapshot of the keys: popping from a dict
                # while iterating its live key view raises RuntimeError.
                for default_name in list(observer_layers_dict):
                    if custom_observe_name not in default_name:
                        observer_layers_dict.pop(default_name)
            # NOTE(review): with several custom names only layers containing
            # every one of them survive; confirm whether "any" was intended.
        return observer_layers_dict

    def get_smooth_mapping_layers(self, smooth_config, mappings=None):
        """Delegate to the base implementation with default smooth mappings.

        Args:
            smooth_config: Must have ``smooth_first_linears`` or
                ``smooth_last_linears`` truthy.
            mappings: Optional list of exactly two
                ``(linear_names, layernorm_name)`` tuples; defaults to the
                q/k/v and gate/up projection mappings below.

        Returns:
            Whatever ``BaseLLMModel.get_smooth_mapping_layers`` returns.
        """
        if mappings is None:
            mappings = [
                (["q_proj", "k_proj", "v_proj"], "input_layernorm"),
                (["gate_proj", "up_proj"], "post_attention_layernorm"),
            ]
        print(f"smooth mappings={mappings}")
        assert len(mappings) == 2
        assert smooth_config.smooth_first_linears or smooth_config.smooth_last_linears
        # TODO: support smooth_last_linears
        return super().get_smooth_mapping_layers(smooth_config, mappings)

    def get_parent_dict(self, observer_layers_dict):
        """Map layer names to a parent name with expert indices collapsed.

        Each key of ``observer_layers_dict`` is rewritten with the regex
        substitutions in ``parent_mapping`` (``experts.<N>`` becomes
        ``experts``). Only names actually changed by a substitution appear
        in the returned ``{layer_name: parent_name}`` dict.
        """
        parent_mapping = {r"experts\.\d+": "experts"}
        parent_dict = {}
        for layer_name in observer_layers_dict:
            parent_name = layer_name
            for pattern, replacement in parent_mapping.items():
                # Apply substitutions cumulatively so that every mapping
                # entry contributes, not only the last one.
                parent_name = re.sub(pattern, replacement, parent_name)
            if parent_name != layer_name:
                parent_dict[layer_name] = parent_name
        return parent_dict

    def get_save_func(self):
        """Return the save class for the configured deploy backend.

        Raises:
            NotImplementedError: If ``self.deploy_backend`` is neither
                ``"vllm"`` nor ``"huggingface"``.
        """
        if self.deploy_backend in ["vllm", "huggingface"]:
            return PTQSaveVllmHF
        else:
            raise NotImplementedError(
                f"deploy_backend {self.deploy_backend} is not supported for saving."
            )
"per-tensor" + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl + max_seq_length: 4096 + num_samples: 256 + batch_size: 1 diff --git a/docs/source/performance/quantization/benchmarks.md b/docs/source/performance/quantization/benchmarks.md index dcc65763..a5392ce4 100644 --- a/docs/source/performance/quantization/benchmarks.md +++ b/docs/source/performance/quantization/benchmarks.md @@ -176,7 +176,7 @@ Qwen2.5VL系列模型的`BF16`、`FP8-Static`、`FP8-Dynamic`、`FP8-Static-ViT` +-------------------+------------------+----------+------------+--------------+ | Model | Quantization | MMMU_VAL | DocVQA_VAL | ChartQA_TEST | +===================+==================+==========+============+==============+ - | Qwen2.5VL-3B | BF16 | 47.11 | 78.57 | 80.32 | + | Qwen2.5VL-3B | BF16 | 47.11 | 78.57 | 80.32 | + +------------------+----------+------------+--------------+ | | FP8-Static | 47.33 | 79.34 | 79.68 | + +------------------+----------+------------+--------------+ @@ -190,7 +190,7 @@ Qwen2.5VL系列模型的`BF16`、`FP8-Static`、`FP8-Dynamic`、`FP8-Static-ViT` + +------------------+----------+------------+--------------+ | | INT4-AWQ | 45.78 | - | 79.60 | +-------------------+------------------+----------+------------+--------------+ - | Qwen2.5VL-7B | BF16 | 45.44 | 89.71 | 84.64 | + | Qwen2.5VL-7B | BF16 | 45.44 | 89.71 | 84.64 | | +------------------+----------+------------+--------------+ | | FP8-Static | 47.00 | 89.83 | 85.92 | + +------------------+----------+------------+--------------+ @@ -204,7 +204,7 @@ Qwen2.5VL系列模型的`BF16`、`FP8-Static`、`FP8-Dynamic`、`FP8-Static-ViT` + +------------------+----------+------------+--------------+ | | INT4-AWQ | 45.67 | 89.28 | - | +-------------------+------------------+----------+------------+--------------+ - | Qwen2.5VL-32B | BF16 | 57.00 | 90.03 | - | + | Qwen2.5VL-32B | 
BF16 | 57.00 | 90.03 | - | | +------------------+----------+------------+--------------+ | | FP8-Static | 57.00 | 89.88 | - | + +------------------+----------+------------+--------------+ @@ -218,7 +218,7 @@ Qwen2.5VL系列模型的`BF16`、`FP8-Static`、`FP8-Dynamic`、`FP8-Static-ViT` + +------------------+----------+------------+--------------+ | | INT4-AWQ | 55.22 | 90.30 | - | +-------------------+------------------+----------+------------+--------------+ - | Qwen2.5VL-72B | BF16 | 58.78 | 94.39 | 85.60 | + | Qwen2.5VL-72B | BF16 | 58.78 | 94.39 | 85.60 | | +------------------+----------+------------+--------------+ | | FP8-Static | 57.89 | 94.41 | 85.84 | + +------------------+----------+------------+--------------+ @@ -253,6 +253,29 @@ DeepSeek-R1-0528模型的`FP8-Block-Wise`、`W4A8-FP8`在`GPQA Diamond`、`AIME +-----------------------+----------------+--------------+-----------+----------+---------------+ ``` +## Seed-OSS-36B-Instruct + +Seed-OSS-36B-Instruct模型的`FP8-Static`、`FP8-Dynamic`在`CEVAL`、`MMLU`、`GSM8K`、`HUMANEVAL`上的评测结果如下: + +```{eval-rst} +.. 
table:: + :align: center + :name: table-seed-oss-36b-performance + + +-------------------------+----------------+---------+--------+----------------+------------------+-------------+ + | Model | Quantization | CEVAL | MMLU | GSM8K-strict | GSM8K-flexible | HUMANEVAL | + +=========================+================+=========+========+================+==================+=============+ + | Seed-OSS-36B-Instruct | BF16 | 88.19 | 82.97 | 70.36 | 97.12 | 87.20 | + + +----------------+---------+--------+----------------+------------------+-------------+ + | | FP8-Static | 87.82 | 82.79 | 74.75 | 96.51 | 86.59 | + + +----------------+---------+--------+----------------+------------------+-------------+ + | | FP8-Dynamic | 87.82 | 82.64 | 74.15 | 96.89 | 87.20 | + +-------------------------+----------------+---------+--------+----------------+------------------+-------------+ + +``` + +该数据使用[lm-eval](https://github.com/EleutherAI/lm-evaluation-harness)工具评测,注意需要设置`--gen_kwargs max_gen_toks`防止思考内容过长被截断。 + ## 其他模型