Commit dceb63b
Merge branch 'foundation-model-stack:main' into main
2 parents: 9d79ba7 + 7777b49
6 files changed: 152 additions & 54 deletions

examples/GPTQ/README.md

Lines changed: 53 additions & 28 deletions
@@ -7,6 +7,7 @@ For generative LLMs, very often the bottleneck of inference is no longer the com
 
 - [FMS Model Optimizer requirements](../../README.md#requirements)
 - `gptqmodel` is needed for this example. Use `pip install gptqmodel` or [install from source](https://github.com/ModelCloud/GPTQModel/tree/main?tab=readme-ov-file)
+  - It is advised to install from source if you plan to use `GPTQv2`
 - Optionally, for the evaluation section below, install [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness)
   ```
   pip install lm-eval
@@ -32,7 +33,7 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
 > - Tokenized data will be saved in `<path_to_save>_train` and `<path_to_save>_test`
 > - If you have trouble downloading the Llama family of models from Hugging Face ([Llama models require access](https://www.llama.com/docs/getting-the-models/hugging-face/)), you can use `ibm-granite/granite-8b-code` instead
 
-2. **Quantize the model** using the data generated above. The following command will kick off the quantization job (by invoking `gptqmodel` under the hood). Additional acceptable arguments can be found in [GPTQArguments](../../fms_mo/training_args.py#L127).
+2. **Quantize the model** using the data generated above. The following command will kick off the `GPTQv1` quantization job (by invoking `gptqmodel` under the hood). Additional acceptable arguments can be found in [GPTQArguments](../../fms_mo/training_args.py#L127).
 
   ```bash
   python -m fms_mo.run_quant \
@@ -41,9 +42,10 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
       --quant_method gptq \
       --output_dir Meta-Llama-3-8B-GPTQ \
       --bits 4 \
-      --group_size 128
+      --group_size 128 \
+
   ```
-  The model that can be found in the specified output directory (`Meta-Llama-3-8B-GPTQ` in our case) can be deployed and inferenced via `vLLM`.
+  The model found in the specified output directory (`Meta-Llama-3-8B-GPTQ` in our case) can be deployed and inferenced via `vLLM`. To enable `GPTQv2`, set the `quant_method` argument to `gptqv2`.
 
 > [!NOTE]
 > - In GPTQ, `group_size` is a trade-off between accuracy and speed, but there is an additional constraint: the `in_features` of each Linear layer to be quantized needs to be an **integer multiple** of `group_size`, i.e. some models may have to use a smaller `group_size` than the default.
@@ -82,44 +84,67 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
 ## Example Test Results
 
 - Unquantized Model
-
-|Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
-|------------|--------------|------:|------|-----:|----------|---|-----:|---|-----:|
-| LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.7103|± |0.0063|
-| | | |none | 5|perplexity|↓ |3.7915|± |0.0727|
+
+  |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+  |------------|--------------|------:|------|-----:|----------|---|-----:|---|-----:|
+  | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.7103|± |0.0063|
+  | | | |none | 5|perplexity|↓ |3.7915|± |0.0727|
 
 - Quantized model with the settings shown above (`desc_act` defaults to False)
-
-|Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
-|------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
-| LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6365 |± |0.0067|
-| | | |none | 5|perplexity|↓ |5.9307 |± |0.1830|
+
+  - `GPTQv1`
+
+    |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+    |------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
+    | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6365 |± |0.0067|
+    | | | |none | 5|perplexity|↓ |5.9307 |± |0.1830|
+
+  - `GPTQv2`
+
+    |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+    |------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
+    | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6817 |± |0.0065|
+    | | | |none | 5|perplexity|↓ |4.3994 |± |0.0995|
 
 - Quantized model with `desc_act` set to `True` (could improve model quality, but at the cost of inference speed)
-
-|Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
-|------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
-| LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6193 |± |0.0068|
-| | | |none | 5|perplexity|↓ |5.8879 |± |0.1546|
+
+  - `GPTQv1`
+
+    |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+    |------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
+    | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6193 |± |0.0068|
+    | | | |none | 5|perplexity|↓ |5.8879 |± |0.1546|
 
 > [!NOTE]
 > There is some randomness in generating the model and data; the resulting accuracy may vary ~$\pm$ 0.05.
 
 
 ## Code Walk-through
 
-1. Command line arguments will be used to create a GPTQ quantization config. Information about the required arguments and their default values can be found [here](../../fms_mo/training_args.py)
+1. Command line arguments will be used to create a GPTQ quantization config. Information about the required arguments and their default values can be found [here](../../fms_mo/training_args.py). Both `GPTQv1` and `GPTQv2` are supported.
 
-   ```python
-   from gptqmodel import GPTQModel, QuantizeConfig
+   - To use `GPTQv1`, set the parameter `quant_method` to `gptq` in the command line.
 
-   quantize_config = QuantizeConfig(
-       bits=gptq_args.bits,
-       group_size=gptq_args.group_size,
-       desc_act=gptq_args.desc_act,
-       damp_percent=gptq_args.damp_percent,
-   )
+     ```python
+     from gptqmodel import GPTQModel, QuantizeConfig
+
+     quantize_config = QuantizeConfig(
+         bits=gptq_args.bits,
+         group_size=gptq_args.group_size,
+         desc_act=gptq_args.desc_act,
+         damp_percent=gptq_args.damp_percent,
+     )
+     ```
+   - To use `GPTQv2`, simply set `quant_method` to `gptqv2` in the command line. Under the hood, two additional arguments will be added to `QuantizeConfig`, i.e. `v2=True` and `v2_memory_device="cpu"`.
 
+     ```python
+     from gptqmodel import GPTQModel, QuantizeConfig
+
+     quantize_config = QuantizeConfig(
+         bits=gptq_args.bits,
+         group_size=gptq_args.group_size,
+         desc_act=gptq_args.desc_act,
+         damp_percent=gptq_args.damp_percent,
+         v2=True,
+         v2_memory_device='cpu',
+     )
 ```
 
 2. Load the pre-trained model with the `gptqmodel` class/wrapper. The tokenizer is optional because we already tokenized the data in a previous step.
@@ -158,4 +183,4 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
     tokenizer.save_pretrained(output_dir)  # optional
 ```
 > [!NOTE]
-> 1. GPTQ of a 70B model usually takes ~4-10 hours on A100.
+> 1. GPTQ of a 70B model usually takes ~4-10 hours on A100 with `GPTQv1`.
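The `group_size` constraint in the README's note can be sanity-checked before launching a quantization job. A minimal sketch, assuming nothing from fms-mo (the helper name and candidate list are illustrative):

```python
def largest_valid_group_size(in_features: int, candidates=(128, 64, 32)) -> int:
    """Return the largest candidate group_size that evenly divides in_features.

    GPTQ requires in_features % group_size == 0 for every quantized Linear
    layer, so models with unusual hidden sizes may need a group_size below
    the default of 128.
    """
    for g in candidates:
        if in_features % g == 0:
            return g
    return 1  # fall back to per-column quantization
```

For example, a Linear layer with `in_features=4096` accepts the default of 128, while a hypothetical layer with `in_features=4160` would have to drop to 64.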

fms_mo/aiu_addons/fp8/fp8_attn.py

Lines changed: 20 additions & 1 deletion
@@ -318,12 +318,31 @@ def _spyre_scaled_paged_compute_op(
         attn_kwargs["block_table"],
     )
 
+
+def __spyre_scaled_paged_validate_attn_kwargs_op(
+    input_ids: torch.Tensor,
+    position_ids: torch.Tensor,
+    past_key_value_states: Optional[list[tuple[torch.Tensor, torch.Tensor]]] = None,
+    **attn_kwargs,
+):
+    __spyre_paged_validate_attn_kwargs_op(
+        input_ids, position_ids, past_key_value_states, **attn_kwargs
+    )
+
+    if past_key_value_states is not None:
+        for k, v in past_key_value_states:
+            assert isinstance(k, ScaledTensor)
+            assert isinstance(v, ScaledTensor)
+
+            # assert that for each layer, the scales are per-sequence
+            assert k._scale.shape[0] == input_ids.shape[0]
+            assert v._scale.shape[0] == input_ids.shape[0]
+
+
 register_attention_op(
     "spyre_paged_attn_fp8",
     _spyre_scaled_paged_store_op,
     compute_op=_math_fp8_compute_op,
     is_prefill_op=lambda **attn_kwargs: attn_kwargs.get("block_table", None)
     is None,
     compute_decode_op=_spyre_scaled_paged_compute_op,
-    validate_attn_kwargs_op=__spyre_paged_validate_attn_kwargs_op,
+    validate_attn_kwargs_op=__spyre_scaled_paged_validate_attn_kwargs_op,
 )
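The intent of the new validation op can be illustrated with a stand-in for `ScaledTensor`. This is a toy sketch under stated assumptions: `FakeScaledTensor` and `validate_per_sequence_scales` are hypothetical names, and plain lists stand in for the real tensors and their `_scale` attribute:

```python
from dataclasses import dataclass


@dataclass
class FakeScaledTensor:
    """Stand-in for fms-mo's ScaledTensor; only the per-sequence _scale matters here."""
    _scale: list  # one scale entry per sequence in the batch


def validate_per_sequence_scales(batch_size: int, past_key_value_states) -> bool:
    """Mirror the new check: each layer's K/V cache must be a scaled tensor
    whose leading scale dimension matches the batch size."""
    for k, v in past_key_value_states:
        assert isinstance(k, FakeScaledTensor) and isinstance(v, FakeScaledTensor)
        assert len(k._scale) == batch_size
        assert len(v._scale) == batch_size
    return True
```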

fms_mo/quant/quantizers.py

Lines changed: 54 additions & 15 deletions
@@ -123,23 +123,28 @@ def get_activation_quantizer(
             )
         elif qa_mode == "dorefa":
             act_quantizer = dorefa_quantize_activation
-        elif (
-            qa_mode == "max"
-        ):  # NOTE Need to be careful using this for activation, particular to 1 sided.
-            act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=False)
-        elif qa_mode == "minmax":
-            act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=True)
+        elif "max" in qa_mode:
+            # NOTE Need to be careful using this for activation, particularly the 1-sided case.
+            if "min" in qa_mode:
+                act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=True)
+            elif "pertoken" in qa_mode or "perToken" in qa_mode:
+                act_quantizer = QMaxDynamic(nbits, dim=-1)
+            elif "per_channel" in qa_mode or "perCh" in qa_mode:
+                act_quantizer = QMaxDynamic(nbits, dim=-2)
+            elif "sym" in qa_mode:
+                act_quantizer = Qmax(
+                    nbits,
+                    align_zero=True,
+                    minmax=False,
+                    extend_act_range=extend_act_range,
+                )
+            else:
+                act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=False)
         elif qa_mode == "fix":
             act_quantizer = QFixSymmetric(
                 nbits, init_clip_val=clip_val, align_zero=align_zero
             )
-        elif qa_mode == "maxsym":
-            act_quantizer = Qmax(
-                nbits,
-                align_zero=True,
-                minmax=False,
-                extend_act_range=extend_act_range,
-            )
         elif qa_mode == "pactsym":
             act_quantizer = PACT2Sym(
                 nbits,
@@ -179,8 +184,6 @@ def get_activation_quantizer(
                 perToken=perToken,
                 emulate=True,
             )
-        elif qa_mode == "pertokenmax":
-            act_quantizer = PerTokenMax(nbits)
         else:
             raise ValueError(f"unrecognized activation quantization mode {qa_mode}")
     else:  # swcap-compatible activation quantizers
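The refactored branch replaces exact-match modes (`max`, `minmax`, `maxsym`, `pertokenmax`) with substring dispatch, which is why the separate `maxsym` and `pertokenmax` branches could be deleted. A pure-Python sketch of the routing, where the returned labels merely name which quantizer the real code would construct:

```python
def route_max_mode(qa_mode: str) -> str:
    """Mimic the new substring dispatch for the "max" family of activation modes."""
    if "max" not in qa_mode:
        return "other"
    if "min" in qa_mode:                                 # e.g. "minmax"
        return "Qmax(minmax=True)"
    if "pertoken" in qa_mode or "perToken" in qa_mode:   # absorbs old "pertokenmax"
        return "QMaxDynamic(dim=-1)"
    if "per_channel" in qa_mode or "perCh" in qa_mode:
        return "QMaxDynamic(dim=-2)"
    if "sym" in qa_mode:                                 # absorbs old "maxsym"
        return "Qmax(extend_act_range)"
    return "Qmax(minmax=False)"                          # plain "max"
```

Note that ordering matters: `"min"` must be caught before the plain `max` fallback, and `pertokenmax` now routes to `QMaxDynamic` rather than the removed `PerTokenMax` branch.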
@@ -3491,6 +3494,42 @@ def __repr__(self):
         return f"{self.__class__.__name__}(num_bits={self.num_bits}, quantizer=)"
 
 
+class QMaxDynamic(nn.Module):
+    def __init__(self, num_bits, dim=-1):
+        """
+        Per-token or per-channel quantization using abs().max() as the scale, usually for
+        activations; could be used for Qbmm M2 as well.
+        (reduce) dim = -1 -> abs().max() outputs a column vector (if input is 2D) => per-token
+                 dim = -2 -> per-channel
+        Zero is aligned so that the levels are symmetric around zero (losing one level).
+        Since the token length is unknown before runtime, the quantizer can only calculate the
+        scales dynamically at run time, meaning no trainable quantization scales are allowed
+        (unless the input sequence length is always the same, not just padded to a fixed length).
+        """
+        super().__init__()
+        self.num_bits = num_bits
+        self.levels = 2 ** (self.num_bits - 1) - 1
+        # Normalize string aliases first, then validate, so reduce_dim is always set.
+        if isinstance(dim, str):
+            if "perCh" in dim or "per_channel" in dim:
+                dim = -2
+            elif "perToken" in dim or "per_token" in dim or "per_Token" in dim:
+                dim = -1
+        if dim in [-1, -2]:
+            self.reduce_dim = dim
+        else:
+            raise ValueError(
+                f"Reduce dim can only be [-1, -2] or ['perCh', 'perToken'] but found {dim}"
+            )
+
+    def forward(self, input_tensor):
+        amax_dim = input_tensor.abs().max(dim=self.reduce_dim, keepdim=True)[0]
+        scales = amax_dim.clamp(min=1e-5).div(self.levels)
+        return input_tensor.div(scales).round().mul(scales)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(num_bits={self.num_bits}, quantizer=)"
+
+
 class Qdynamic(nn.Module):
     def __init__(
         self,
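`QMaxDynamic.forward` is a symmetric abs-max fake quantization computed per token at run time. A dependency-free sketch of the per-token (`dim=-1`) path, using plain lists instead of tensors (the function name is illustrative, not the fms-mo API):

```python
def fake_quant_per_token(rows, num_bits=8):
    """Symmetric per-token fake quantization with abs-max scaling.

    Each row (token) gets its own scale = max(|x|) / (2**(num_bits - 1) - 1),
    computed on the fly, so there are no trainable scales. Values are divided
    by the scale, rounded to the integer grid, then multiplied back.
    """
    levels = 2 ** (num_bits - 1) - 1
    out = []
    for row in rows:
        # clamp the abs-max at 1e-5 to avoid a zero scale, as the real class does
        scale = max(max(abs(x) for x in row), 1e-5) / levels
        out.append([round(x / scale) * scale for x in row])
    return out
```

The per-row abs-max always maps back to itself (it sits exactly on the top level), while intermediate values snap to the nearest of the `2 * levels + 1` symmetric levels.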

fms_mo/run_quant.py

Lines changed: 18 additions & 7 deletions
@@ -88,7 +88,7 @@ def quantize(
 
     logger.info(f"{fms_mo_args}\n{opt_args.quant_method}\n")
 
-    if opt_args.quant_method == "gptq":
+    if opt_args.quant_method in ["gptq", "gptqv2"]:
         if not available_packages["gptqmodel"]:
             raise ImportError(
                 "Quantization method has been selected as gptq but unable to use external library, "
@@ -138,12 +138,23 @@ def run_gptq(model_args, data_args, opt_args, gptq_args):
 
     logger = set_log_level(opt_args.log_level, "fms_mo.run_gptq")
 
-    quantize_config = QuantizeConfig(
-        bits=gptq_args.bits,
-        group_size=gptq_args.group_size,
-        desc_act=gptq_args.desc_act,
-        damp_percent=gptq_args.damp_percent,
-    )
+    if opt_args.quant_method == "gptq":
+        quantize_config = QuantizeConfig(
+            bits=gptq_args.bits,
+            group_size=gptq_args.group_size,
+            desc_act=gptq_args.desc_act,
+            damp_percent=gptq_args.damp_percent,
+        )
+    else:
+        quantize_config = QuantizeConfig(
+            bits=gptq_args.bits,
+            group_size=gptq_args.group_size,
+            desc_act=gptq_args.desc_act,
+            damp_percent=gptq_args.damp_percent,
+            v2=True,
+            v2_memory_device="cpu",
+        )
 
     # Add custom model_type mapping to gptqmodel LUT so GPTQModel can recognize them.
     for mtype, cls in custom_gptq_classes.items():
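The two `QuantizeConfig` calls above differ only in the two v2 flags, so the branching can be summarized as a small kwargs builder. A sketch of the logic only; `build_quantize_kwargs` is a hypothetical helper, not part of fms-mo:

```python
def build_quantize_kwargs(quant_method: str, bits=4, group_size=128,
                          desc_act=False, damp_percent=0.01) -> dict:
    """GPTQv1 and GPTQv2 share the base arguments; gptqv2 just adds
    v2=True and v2_memory_device="cpu" before QuantizeConfig is built."""
    kwargs = dict(bits=bits, group_size=group_size,
                  desc_act=desc_act, damp_percent=damp_percent)
    if quant_method == "gptqv2":
        kwargs.update(v2=True, v2_memory_device="cpu")
    return kwargs
```

A helper like this would also remove the duplicated call in `run_gptq`, since `QuantizeConfig(**build_quantize_kwargs(...))` covers both branches.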

fms_mo/training_args.py

Lines changed: 5 additions & 1 deletion
@@ -138,7 +138,10 @@ class OptArguments(TypeChecker):
     """Dataclass for optimization related arguments."""
 
     quant_method: str = field(
-        metadata={"choices": ["gptq", "fp8", "dq"], "help": "Quantization technique"}
+        metadata={
+            "choices": ["gptq", "gptqv2", "fp8", "dq"],
+            "help": "Quantization technique",
+        }
     )
     output_dir: str = field(
         metadata={
@@ -226,6 +229,7 @@ class GPTQArguments(TypeChecker):
     cache_examples_on_gpu: bool = True
 
 
+
 @dataclass
 class FP8Arguments(TypeChecker):
     """Dataclass for FP8 related arguments that will be used by llm-compressor."""

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -23,9 +23,9 @@ classifiers=[
 dynamic = ["version"]
 dependencies = [
     "numpy>=1.26.4,<2.3.0",
-    "accelerate>=0.20.3,!=0.34,<1.9",
+    "accelerate>=0.20.3,!=0.34,<1.10",
     "transformers>=4.45,<4.54",
-    "torch>=2.2.0,<2.6",
+    "torch>=2.2.0,<2.8",
     "tqdm>=4.66.2,<5.0",
     "datasets>=3.0.0,<5.0",
     "pandas",
