From 715930bea6abf01ef4c144cf15cbb424cb705419 Mon Sep 17 00:00:00 2001
From: RuBing-Yang
Date: Mon, 8 Sep 2025 18:24:02 +0800
Subject: [PATCH] add Qwen3-Coder-480B-A35B-Instruct low_memory config

---
 ...en3_coder-a35b_fp8_dynamic_low_memory.yaml | 27 ++++++++++++++
 ...wen3_coder-a35b_fp8_static_low_memory.yaml | 35 +++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic_low_memory.yaml
 create mode 100644 configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static_low_memory.yaml

diff --git a/configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic_low_memory.yaml b/configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic_low_memory.yaml
new file mode 100644
index 00000000..483b6c3c
--- /dev/null
+++ b/configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic_low_memory.yaml
@@ -0,0 +1,27 @@
+# Global configuration of the pipeline
+global:
+  save_path: ./output  # relative path; presumably the output directory - confirm
+
+# Simplified configuration for LLM compression: which model to load and how
+model:
+  name: Qwen
+  model_path: Qwen/Qwen3-Coder-480B-A35B-Instruct  # looks like a hub repo id, not a local path - confirm
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: cpu  # NOTE(review): presumably keeps weights off the GPU for the low_memory path - confirm
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    low_memory: true  # the option this *_low_memory config variant is named after
+    name: fp8_dynamic
+    bits: 8
+    quant_method:  # weights and activations use the same granularity
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:  # Skip quantization for these layers
+      - "lm_head"
+      - "model.embed_tokens"
diff --git a/configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static_low_memory.yaml b/configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static_low_memory.yaml
new file mode 100644
index 00000000..e4ee611a
--- /dev/null
+++ b/configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static_low_memory.yaml
@@ -0,0 +1,35 @@
+# Global configuration of the pipeline
+global:
+  save_path: ./output  # relative path; presumably the output directory - confirm
+
+# Simplified configuration for LLM compression: which model to load and how
+model:
+  name: Qwen
+  model_path: Qwen/Qwen3-Coder-480B-A35B-Instruct  # looks like a hub repo id, not a local path - confirm
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: cpu  # NOTE(review): presumably keeps weights off the GPU for the low_memory path - confirm
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    low_memory: true  # the option this *_low_memory config variant is named after
+    name: fp8_static
+    bits: 8
+    quant_method:  # weights and activations use the same granularity
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:  # Skip quantization for these layers
+      - "lm_head"
+      - "model.embed_tokens"
+
+# Dataset for calibration. NOTE(review): data file is named qwen3_a22B, model is A35B - confirm
+dataset:
+  name: TextDataset
+  data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl
+  max_seq_length: 4096
+  num_samples: 256  # presumably the number of calibration samples drawn from data_path - confirm
+  batch_size: 1