Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions angelslim/models/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@
from .kimi_k2 import KimiK2 # noqa: F401
from .llama import Llama # noqa: F401
from .qwen import Qwen # noqa: F401
from .seed_oss import SeedOss # noqa: F401
95 changes: 95 additions & 0 deletions angelslim/models/llm/seed_oss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Copyright 2025 Tencent Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

import torch.nn as nn

from ...compressor.quant.core import PTQSaveVllmHF
from ..base_model import BaseLLMModel
from ..model_factory import SlimModelFactory


@SlimModelFactory.register
class SeedOss(BaseLLMModel):
    """Seed-OSS LLM wrapper registered with ``SlimModelFactory``.

    Selects which ``nn.Linear`` layers inside the ``model.layers`` block are
    observed for quantization, supplies the default smoothing mappings, and
    chooses the save routine for the configured deploy backend.
    """

    def __init__(
        self,
        model=None,
        deploy_backend="vllm",
    ):
        """Initialize the wrapper.

        Args:
            model: Forwarded unchanged to ``BaseLLMModel.__init__``.
            deploy_backend: Target backend name; ``get_save_func`` accepts
                ``"vllm"`` and ``"huggingface"``.
        """
        super().__init__(
            model=model,
            deploy_backend=deploy_backend,
        )
        # Name prefix a linear layer must have to be considered for observation.
        self.block_name = "model.layers"

    def get_observer_layers(self):
        """Return the linear layers to observe, keyed by layer name.

        A layer is selected when its name starts with ``self.block_name`` and
        its last dotted component is one of the attention/MLP projection names
        listed below. Every other linear layer found is appended to the ignore
        list, which is written to
        ``self.quant_config.quant_algo_info["ignore_layers"]``.

        When ``quant_config.custom_observe_layers_names`` is not ``"default"``,
        the selection is narrowed to layers whose name contains every custom
        name as a substring.

        Returns:
            dict: Mapping of layer name to module for the observed layers.
        """
        names = [
            "k_proj",
            "v_proj",
            "q_proj",
            "o_proj",
            "up_proj",
            "gate_proj",
            "down_proj",
        ]
        obs_layers = [nn.Linear]
        observer_layers_dict = {}
        # NOTE(review): find_layers is defined outside this file; it is used
        # here as returning a {name: module} mapping -- confirm.
        layers_dict = self.find_layers(self.model, layers=obs_layers)

        ignore_layers = self.skip_layer_names()
        for name, module in layers_dict.items():
            if name.startswith(self.block_name) and name.split(".")[-1] in names:
                observer_layers_dict[name] = module
            else:
                ignore_layers.append(name)
        self.quant_config.quant_algo_info["ignore_layers"] = ignore_layers

        custom_names = self.quant_config.custom_observe_layers_names
        if custom_names != "default":
            # Build a new dict instead of popping entries while iterating the
            # same dict, which raises "dictionary changed size during
            # iteration" as soon as one layer is filtered out.
            observer_layers_dict = {
                layer_name: module
                for layer_name, module in observer_layers_dict.items()
                if all(custom_name in layer_name for custom_name in custom_names)
            }
        return observer_layers_dict

    def get_smooth_mapping_layers(self, smooth_config, mappings=None):
        """Return the smoothing layer mappings via the base implementation.

        Args:
            smooth_config: Smoothing configuration; at least one of
                ``smooth_first_linears`` / ``smooth_last_linears`` must be
                truthy.
            mappings: Optional list of exactly two
                ``(projection names, norm layer name)`` pairs. Defaults to the
                attention projections paired with ``input_layernorm`` and the
                gate/up projections paired with ``post_attention_layernorm``.

        Returns:
            Whatever ``BaseLLMModel.get_smooth_mapping_layers`` returns for the
            given config and mappings.

        Raises:
            AssertionError: If ``mappings`` does not hold exactly two entries
                or neither smoothing flag is set.
        """
        if mappings is None:
            mappings = [
                (["q_proj", "k_proj", "v_proj"], "input_layernorm"),
                (["gate_proj", "up_proj"], "post_attention_layernorm"),
            ]
        print(f"smooth mappings={mappings}")
        assert len(mappings) == 2
        assert smooth_config.smooth_first_linears or smooth_config.smooth_last_linears
        # TODO: support smooth_last_linears
        return super().get_smooth_mapping_layers(smooth_config, mappings)

    def get_parent_dict(self, observer_layers_dict):
        """Map observed layer names to their rewritten parent names.

        Each regex in ``parent_mapping`` is applied to the layer name in turn;
        currently ``experts.<N>`` is collapsed to ``experts``. Only layers
        whose name actually changes are included in the result.

        Args:
            observer_layers_dict: Mapping whose keys are layer names.

        Returns:
            dict: Layer name to rewritten parent name, for changed names only.
        """
        parent_mapping = {r"experts\.\d+": "experts"}
        parent_dict = {}
        for layer_name in observer_layers_dict:
            parent_name = layer_name
            for pattern, replacement in parent_mapping.items():
                # Chain on parent_name so every mapping contributes; restarting
                # from layer_name would keep only the last substitution.
                parent_name = re.sub(pattern, replacement, parent_name)
            if parent_name != layer_name:
                parent_dict[layer_name] = parent_name
        return parent_dict

    def get_save_func(self):
        """Return the save class for the configured deploy backend.

        Returns:
            ``PTQSaveVllmHF`` when ``deploy_backend`` is ``"vllm"`` or
            ``"huggingface"``.

        Raises:
            NotImplementedError: For any other ``deploy_backend`` value.
        """
        if self.deploy_backend in ["vllm", "huggingface"]:
            return PTQSaveVllmHF
        raise NotImplementedError(
            f"deploy_backend {self.deploy_backend} is not supported for saving."
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# PTQ config: fp8_dynamic quantization of ByteDance-Seed/Seed-OSS-36B-Instruct.
# NOTE(review): this config has no dataset section -- presumably fp8_dynamic
# needs no calibration data; confirm.

# Global configuration of pipeline
global:
save_path: ./output

# Simplified Configuration for LLM compression
model:
# Model class name (SeedOss is the class registered with SlimModelFactory)
name: SeedOss
model_path: ByteDance-Seed/Seed-OSS-36B-Instruct
trust_remote_code: true
low_cpu_mem_usage: true
use_cache: false
torch_dtype: auto
device_map: auto

# Compression configuration
compression:
name: PTQ
quantization:
name: fp8_dynamic
bits: 8
# Quantization granularity for weights and activations
quant_method:
weight: "per-tensor"
activation: "per-tensor"
ignore_layers: # Skip quantization for these layers
- "lm_head"
- "model.embed_tokens"

34 changes: 34 additions & 0 deletions configs/seed_oss/fp8_static/seed_oss-36b_instruct_fp8_static.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# PTQ config: fp8_static quantization of ByteDance-Seed/Seed-OSS-36B-Instruct,
# with a calibration dataset section at the end.

# Global configuration of pipeline
global:
save_path: ./output

# Simplified Configuration for LLM compression
model:
# Model class name (SeedOss is the class registered with SlimModelFactory)
name: SeedOss
model_path: ByteDance-Seed/Seed-OSS-36B-Instruct
trust_remote_code: true
low_cpu_mem_usage: true
use_cache: false
torch_dtype: auto
device_map: auto

# Compression configuration
compression:
name: PTQ
quantization:
name: fp8_static
bits: 8
# Quantization granularity for weights and activations
quant_method:
weight: "per-tensor"
activation: "per-tensor"
ignore_layers: # Skip quantization for these layers
- "lm_head"
- "model.embed_tokens"

# Dataset for calibration
dataset:
name: TextDataset
data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl
max_seq_length: 4096
num_samples: 256
batch_size: 1
31 changes: 27 additions & 4 deletions docs/source/performance/quantization/benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ Qwen2.5VL系列模型的`BF16`、`FP8-Static`、`FP8-Dynamic`、`FP8-Static-ViT`
+-------------------+------------------+----------+------------+--------------+
| Model | Quantization | MMMU_VAL | DocVQA_VAL | ChartQA_TEST |
+===================+==================+==========+============+==============+
| Qwen2.5VL-3B | BF16 | 47.11 | 78.57 | 80.32 |
| Qwen2.5VL-3B | BF16 | 47.11 | 78.57 | 80.32 |
+ +------------------+----------+------------+--------------+
| | FP8-Static | 47.33 | 79.34 | 79.68 |
+ +------------------+----------+------------+--------------+
Expand All @@ -190,7 +190,7 @@ Qwen2.5VL系列模型的`BF16`、`FP8-Static`、`FP8-Dynamic`、`FP8-Static-ViT`
+ +------------------+----------+------------+--------------+
| | INT4-AWQ | 45.78 | - | 79.60 |
+-------------------+------------------+----------+------------+--------------+
| Qwen2.5VL-7B | BF16 | 45.44 | 89.71 | 84.64 |
| Qwen2.5VL-7B | BF16 | 45.44 | 89.71 | 84.64 |
| +------------------+----------+------------+--------------+
| | FP8-Static | 47.00 | 89.83 | 85.92 |
+ +------------------+----------+------------+--------------+
Expand All @@ -204,7 +204,7 @@ Qwen2.5VL系列模型的`BF16`、`FP8-Static`、`FP8-Dynamic`、`FP8-Static-ViT`
+ +------------------+----------+------------+--------------+
| | INT4-AWQ | 45.67 | 89.28 | - |
+-------------------+------------------+----------+------------+--------------+
| Qwen2.5VL-32B | BF16 | 57.00 | 90.03 | - |
| Qwen2.5VL-32B | BF16 | 57.00 | 90.03 | - |
| +------------------+----------+------------+--------------+
| | FP8-Static | 57.00 | 89.88 | - |
+ +------------------+----------+------------+--------------+
Expand All @@ -218,7 +218,7 @@ Qwen2.5VL系列模型的`BF16`、`FP8-Static`、`FP8-Dynamic`、`FP8-Static-ViT`
+ +------------------+----------+------------+--------------+
| | INT4-AWQ | 55.22 | 90.30 | - |
+-------------------+------------------+----------+------------+--------------+
| Qwen2.5VL-72B | BF16 | 58.78 | 94.39 | 85.60 |
| Qwen2.5VL-72B | BF16 | 58.78 | 94.39 | 85.60 |
| +------------------+----------+------------+--------------+
| | FP8-Static | 57.89 | 94.41 | 85.84 |
+ +------------------+----------+------------+--------------+
Expand Down Expand Up @@ -253,6 +253,29 @@ DeepSeek-R1-0528模型的`FP8-Block-Wise`、`W4A8-FP8`在`GPQA Diamond`、`AIME
+-----------------------+----------------+--------------+-----------+----------+---------------+
```

## Seed-OSS-36B-Instruct

Seed-OSS-36B-Instruct模型的`BF16`、`FP8-Static`、`FP8-Dynamic`在`CEVAL`、`MMLU`、`GSM8K`、`HUMANEVAL`上的评测结果如下:

```{eval-rst}
.. table::
:align: center
:name: table-seed-oss-36b-performance

+-------------------------+----------------+---------+--------+----------------+------------------+-------------+
| Model | Quantization | CEVAL | MMLU | GSM8K-strict | GSM8K-flexible | HUMANEVAL |
+=========================+================+=========+========+================+==================+=============+
| Seed-OSS-36B-Instruct | BF16 | 88.19 | 82.97 | 70.36 | 97.12 | 87.20 |
+ +----------------+---------+--------+----------------+------------------+-------------+
| | FP8-Static | 87.82 | 82.79 | 74.75 | 96.51 | 86.59 |
+ +----------------+---------+--------+----------------+------------------+-------------+
| | FP8-Dynamic | 87.82 | 82.64 | 74.15 | 96.89 | 87.20 |
+-------------------------+----------------+---------+--------+----------------+------------------+-------------+

```

该数据使用[lm-eval](https://github.com/EleutherAI/lm-evaluation-harness)工具评测。注意需要通过`--gen_kwargs max_gen_toks=<N>`设置足够大的最大生成长度,防止思考内容过长被截断。


## 其他模型

Expand Down