ModelTC
diff --git a/‎configs/quantization/methods/MixPrecision/awq_w_a.yml‎
Lines changed: 52 additions & 0 deletions b/‎configs/quantization/methods/MixPrecision/awq_w_a.yml‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎configs/quantization/methods/MixPrecision/awq_w_a_static.yml‎
Lines changed: 49 additions & 0 deletions b/‎configs/quantization/methods/MixPrecision/awq_w_a_static.yml‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎configs/quantization/methods/MixPrecision/rtn_w_a.yml‎
Lines changed: 38 additions & 0 deletions b/‎configs/quantization/methods/MixPrecision/rtn_w_a.yml‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎configs/quantization/methods/MixPrecision/rtn_w_a_static.yml‎
Lines changed: 44 additions & 0 deletions b/‎configs/quantization/methods/MixPrecision/rtn_w_a_static.yml‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎llmc/compression/quantization/auto_clip.py‎
Lines changed: 8 additions & 53 deletions b/‎llmc/compression/quantization/auto_clip.py‎
Lines changed: 8 additions & 53 deletions
@@ -0,0 +1,52 @@
+base:
+    seed: &seed 42
+model:
+    type: DeepseekV2
+    path: /path/to/DeepseekV2
+    torch_dtype: auto
+calib:
+    name: pileval
+    download: False
+    path: /path/to/pileval
+    n_samples: 128
+    bs: -1
+    seq_len: 512
+    preproc: pileval_awq
+    seed: *seed
+eval:
+    eval_pos: [fake_quant]
+    name: wikitext2
+    download: False
+    path: /path/to/wikitext2
+    seq_len: 2048
+    bs: 1
+    inference_per_block: False
+quant:
+    method: Awq
+    weight:
+        bit: 8
+        symmetric: True
+        granularity: per_channel
+        group_size: -1
+    act:
+        bit: 8
+        symmetric: True
+        granularity: per_token
+    special:
+        trans: True
+        trans_version: v2
+        weight_clip: False
+        clip_sym: True
+ignored_layers:
+    # block_ids and layer_names together determine which layers are kept in high precision (such as bf16 or fp16) instead of being quantized.
+    # For example, '4' combined with 'self_attn.q_proj' keeps the model.layers.4.self_attn.q_proj layer in high precision,
+    # while '15-23' combined with 'self_attn.kv_b_proj' leaves self_attn.kv_b_proj unquantized in blocks 15 to 23.
+    block_ids: [4, 5, 6, 15-23]
+    layer_names: ["self_attn.q_proj", "self_attn.kv_a_proj_with_mqa", "self_attn.kv_b_proj", "self_attn.o_proj"]
+    # You can also keep individual layers in high precision by listing them in speical_names;
+    # each entry must be the full name of the layer (e.g. "model.layers.0.mlp.down_proj").
+    speical_names: ["model.layers.0.mlp.down_proj"]
+save:
+    save_vllm: False
+    save_fake: False
+    save_path: /path/to/save/
@@ -0,0 +1,49 @@
+base:
+    seed: &seed 42
+model:
+    type: DeepseekV2
+    path: /path/to/DeepseekV2
+    torch_dtype: auto
+calib:
+    name: pileval
+    download: False
+    path: /path/to/pileval
+    n_samples: 128
+    bs: -1
+    seq_len: 512
+    preproc: pileval_awq
+    seed: *seed
+eval:
+    eval_pos: [fake_quant]
+    name: wikitext2
+    download: False
+    path: /path/to/wikitext2
+    seq_len: 2048
+    bs: 1
+    inference_per_block: False
+quant:
+    method: Awq
+    weight:
+        bit: 8
+        symmetric: True
+        granularity: per_channel
+        group_size: -1
+    act:
+        bit: 8
+        symmetric: True
+        granularity: per_tensor
+        static: True
+        calib_algo: static_hist
+    special:
+        trans: True
+        trans_version: v2
+        weight_clip: False
+        clip_sym: True
+ignored_layers:
+    block_ids: [0-26]
+    layer_names: ["self_attn.q_proj", "self_attn.kv_a_proj_with_mqa", "self_attn.kv_b_proj", "self_attn.o_proj"]
+    speical_names: []
+save:
+    save_vllm: False
+    save_fake: False
+    save_path: /path/to/save/
@@ -0,0 +1,38 @@
+base:
+    seed: &seed 42
+model:
+    type: DeepseekV2
+    path: /path/to/DeepseekV2
+    torch_dtype: auto
+eval:
+    eval_pos: [pretrain, fake_quant]
+    name: wikitext2
+    download: False
+    path: /path/to/wikitext2
+    seq_len: 2048
+    bs: 1
+    inference_per_block: False
+quant:
+    method: RTN
+    weight:
+        bit: 8
+        symmetric: True
+        granularity: per_channel
+        group_size: -1
+    act:
+        bit: 8
+        symmetric: True
+        granularity: per_token
+ignored_layers:
+    # block_ids and layer_names together determine which layers are kept in high precision (such as bf16 or fp16) instead of being quantized.
+    # For example, '4' combined with 'self_attn.q_proj' keeps the model.layers.4.self_attn.q_proj layer in high precision,
+    # while '15-23' combined with 'self_attn.kv_b_proj' leaves self_attn.kv_b_proj unquantized in blocks 15 to 23.
+    block_ids: [4, 5, 6, 15-23]
+    layer_names: ["self_attn.q_proj", "self_attn.kv_a_proj_with_mqa", "self_attn.kv_b_proj", "self_attn.o_proj"]
+    # You can also keep individual layers in high precision by listing them in speical_names;
+    # each entry must be the full name of the layer (e.g. "model.layers.0.mlp.down_proj").
+    speical_names: ["model.layers.0.mlp.down_proj"]
+save:
+    save_vllm: False
+    save_fake: False
+    save_path: /path/to/save/
@@ -0,0 +1,44 @@
+base:
+    seed: &seed 42
+model:
+    type: DeepseekV2
+    path: /path/to/DeepseekV2
+    torch_dtype: auto
+calib:
+    name: pileval
+    download: False
+    path: /path/to/pileval
+    n_samples: 128
+    bs: 1
+    seq_len: 2048
+    preproc: txt_general_preproc
+    seed: *seed
+eval:
+    eval_pos: [fake_quant]
+    name: wikitext2
+    download: False
+    path: /path/to/wikitext2
+    seq_len: 2048
+    bs: 1
+    inference_per_block: False
+quant:
+    method: RTN
+    weight:
+        bit: 8
+        symmetric: True
+        granularity: per_channel
+        group_size: -1
+    act:
+        bit: 8
+        symmetric: True
+        granularity: per_tensor
+        static: True
+        calib_algo: static_hist
+ignored_layers:
+    block_ids: [0-26]
+    layer_names: ["self_attn.q_proj", "self_attn.kv_a_proj_with_mqa", "self_attn.kv_b_proj", "self_attn.o_proj"]
+    speical_names: []
+save:
+    save_vllm: False
+    save_fake: False
+    save_path: /path/to/save/
@@ -6,8 +6,7 @@
 from loguru import logger
 
 from .module_utils import _LLMC_LINEAR_TYPES_, _TRANSFORMERS_LINEAR_TYPES_
-from .utils import (check_do_quant, check_w_only, get_aquantizer,
-                    get_wquantizer, is_fp8_supported_gpu)
+from .utils import is_fp8_supported_gpu
 
 if is_fp8_supported_gpu():
     from .kernel import weight_cast_to_bf16, weight_cast_to_fp8
@@ -21,17 +20,13 @@ class AutoClipper:
     def __init__(
         self,
         w_only,
-        mix_bits_map,
-        quantizer_mix_bits,
         wquantizer,
         aquantizer,
         clip_version,
         clip_sym,
         save_clip,
         padding_mask,
     ):
-        self.mix_bits_map = mix_bits_map
-        self.quantizer_mix_bits = quantizer_mix_bits
         self.wquantizer = wquantizer
         self.aquantizer = aquantizer
         self.clip_version = clip_version
@@ -45,14 +40,6 @@ def __init__(
     @torch.no_grad()
     def run(self, block, block_idx, input_feat, n_sample_token):
         for n, m in block.named_modules():
-            if not check_do_quant(
-                block_idx, n, self.mix_bits_map, self.quantizer_mix_bits
-            ):
-                logger.info(
-                    f'This layer {n} in {block_idx}-th block is set to float.'
-                    f'No need to clip this layer.'
-                )
-                continue
             if isinstance(m, tuple(_LLMC_LINEAR_TYPES_ + _TRANSFORMERS_LINEAR_TYPES_)):
                 if m.weight.data.dtype == torch.float8_e4m3fn:
                     is_fp8_weight = True
@@ -105,15 +92,8 @@ def auto_clip_layer(
 
         assert w.dim() == 2
 
-        wquantizer = get_wquantizer(
-            block_idx,
-            layer_name,
-            self.mix_bits_map,
-            self.quantizer_mix_bits,
-            self.wquantizer,
-        )
-        if wquantizer.granularity == 'per_group':
-            group_size = wquantizer.group_size
+        if self.wquantizer.granularity == 'per_group':
+            group_size = self.wquantizer.group_size
         else:
             group_size = w.shape[1]
 
@@ -143,13 +123,7 @@ def auto_clip_layer(
             org_out_dict = {}
             for i_s in range(int(max_shrink * n_grid)):
                 if i_s == 0:
-                    if self.clip_version == 'v2' and not check_w_only(
-                        block_idx,
-                        layer_name,
-                        self.mix_bits_map,
-                        self.quantizer_mix_bits,
-                        self.w_only,
-                    ):
+                    if self.clip_version == 'v2' and not self.w_only:
                         i_s += eps
                 err_mean = 0
                 for i in range(len(inputs)):
@@ -254,15 +228,8 @@ def apply_clip(self, block_idx, layer, min_val, max_val, layer_name):
             raise Exception('Not support other clip version')
 
     def get_clip_factor(self, block_idx, layer, min_val, max_val, layer_name):
-        wquantizer = get_wquantizer(
-            block_idx,
-            layer_name,
-            self.mix_bits_map,
-            self.quantizer_mix_bits,
-            self.wquantizer,
-        )
-        org_min_val, org_max_val = wquantizer.get_minmax_range(
-            wquantizer.reshape_tensor(layer.weight.data)
+        org_min_val, org_max_val = self.wquantizer.get_minmax_range(
+            self.wquantizer.reshape_tensor(layer.weight.data)
         )
         org_val_shape = org_max_val.shape
 
@@ -304,20 +271,8 @@ def fake_quantize_weight(self, w, min_val, max_val, org_min_val, org_max_val):
         return q_w
 
     def fake_quantize_input(self, block_idx, x, layer_name):
-        if not check_w_only(
-            block_idx,
-            layer_name,
-            self.mix_bits_map,
-            self.quantizer_mix_bits,
-            self.w_only,
-        ):
-            q_x = get_aquantizer(
-                block_idx,
-                layer_name,
-                self.mix_bits_map,
-                self.quantizer_mix_bits,
-                self.aquantizer,
-            ).fake_quant_act_dynamic(x)
+        if not self.w_only:
+            q_x = self.aquantizer.fake_quant_act_dynamic(x)
         else:
             q_x = x
         return q_x