diff --git a/.pylintrc b/.pylintrc
index ad37324f..4effcbf7 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -63,9 +63,9 @@ ignore-patterns=^\.#
 # (useful for modules/projects where namespaces are manipulated during runtime
 # and thus existing member attributes cannot be deduced by static analysis). It
 # supports qualified module names, as well as Unix pattern matching.
-ignored-modules=auto_gptq,
-                exllama_kernels,
-                exllamav2_kernels,
+ignored-modules=gptqmodel,
+                gptqmodel_exllama_kernels,
+                gptqmodel_exllamav2_kernels,
                 llmcompressor,
                 cutlass_mm,
                 pygraphviz,
diff --git a/.spellcheck-en-custom.txt b/.spellcheck-en-custom.txt
index 50ba3276..fc1519f2 100644
--- a/.spellcheck-en-custom.txt
+++ b/.spellcheck-en-custom.txt
@@ -6,7 +6,6 @@ AIU
 Spyre
 spyre
 Args
-AutoGPTQ
 autoregressive
 backpropagation
 bmm
@@ -38,8 +37,9 @@ frac
 gptq
 GPTQ
 GPTQArguments
+GPTQModel
+gptqmodel
 graphviz
-GPTQ
 hyperparameters
 Inductor
 inferenced
diff --git a/README.md b/README.md
index 21b1adcb..a27e36a9 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ FMS Model Optimizer is a framework for developing reduced precision neural netwo
 *Optional packages based on optimization functionality required:*
 
 - **GPTQ** is a popular compression method for LLMs:
-    - [auto_gptq](https://pypi.org/project/auto-gptq/) or build from [source](https://github.com/AutoGPTQ/AutoGPTQ)
+    - [gptqmodel](https://pypi.org/project/gptqmodel/) or build from [source](https://github.com/ModelCloud/GPTQModel)
 - If you want to experiment with **INT8** deployment in [QAT](./examples/QAT_INT8/) and [PTQ](./examples/PTQ_INT8/) examples:
     - Nvidia GPU with compute capability > 8.0 (A100 family or higher)
     - Option 1:
diff --git a/docs/fms_mo_design.md b/docs/fms_mo_design.md
index a803a359..efc40a97 100644
--- a/docs/fms_mo_design.md
+++ b/docs/fms_mo_design.md
@@ -82,7 +82,7 @@ FMS Model Optimizer supports FP8 in two ways:
 
 ### GPTQ (weight-only compression, or sometimes referred to as W4A16)
 
-For generative LLMs, very often the bottleneck of inference is no longer the computation itself but the data transfer. In such case, all we need is an efficient compression method to reduce the model size in memory, together with an efficient GPU kernel that can bring in the compressed data and only decompress it at GPU cache-level right before performing an FP16 computation. This approach is very powerful because it could reduce the number of GPUs for serving the model by 4X without sacrificing inference speed. (Some constraints may apply, such as batch size cannot exceed a certain number.) FMS Model Optimizer supports this method simply by utilizing `auto_gptq` package. See this [example](../examples/GPTQ/)
+For generative LLMs, the inference bottleneck is often no longer the computation itself but the data transfer. In such cases, all we need is an efficient compression method that reduces the model size in memory, together with an efficient GPU kernel that brings in the compressed data and decompresses it at the GPU cache level right before performing an FP16 computation. This approach is powerful because it can reduce the number of GPUs needed to serve the model by 4X without sacrificing inference speed. (Some constraints may apply; for example, the batch size cannot exceed a certain number.) FMS Model Optimizer supports this method by utilizing the `gptqmodel` package. See this [example](../examples/GPTQ/).
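The 4X figure above follows directly from the bit widths involved. As a back-of-the-envelope sketch (illustrative only: the 8B parameter count and the `group_size` of 128 are assumed for the example, not taken from this change):

```python
# Rough weight-memory estimate for W4A16 (4-bit weights, FP16 compute).
# Assumes 8B parameters and one FP16 scale + zero-point per group of 128 weights.
n_params = 8e9
group_size = 128

fp16_bytes = n_params * 2                     # 16-bit weights: 2 bytes each
int4_bytes = n_params * 0.5                   # 4-bit weights: 0.5 bytes each
overhead_bytes = (n_params / group_size) * 4  # ~4 bytes of scale/zero-point per group

print(f"FP16 weights:  {fp16_bytes / 1e9:.2f} GB")                     # ~16.00 GB
print(f"W4A16 weights: {(int4_bytes + overhead_bytes) / 1e9:.2f} GB")  # ~4.25 GB, ~3.8x smaller
```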
 
 ## Specification
 
diff --git a/examples/GPTQ/README.md b/examples/GPTQ/README.md
index 525cde80..a9ffb708 100644
--- a/examples/GPTQ/README.md
+++ b/examples/GPTQ/README.md
@@ -1,12 +1,12 @@
 # Generative Pre-Trained Transformer Quantization (GPTQ) of LLAMA-3-8B Model
 
-For generative LLMs, very often the bottleneck of inference is no longer the computation itself but the data transfer. In such case, all we need is an efficient compression method to reduce the model size in memory, together with an efficient GPU kernel that can bring in the compressed data and only decompress it at GPU cache-level right before performing an FP16 computation. This approach is very powerful because it could reduce the number of GPUs for serving the model by 4X without sacrificing inference speed (some constraints may apply, such as batch size cannot exceed a certain number.) FMS Model Optimizer supports this "weight-only compression", or sometimes referred to as W4A16 or [GPTQ](https://arxiv.org/pdf/2210.17323) by leveraging `auto_gptq`, a third party library, to perform quantization.
+For generative LLMs, the inference bottleneck is often no longer the computation itself but the data transfer. In such cases, all we need is an efficient compression method that reduces the model size in memory, together with an efficient GPU kernel that brings in the compressed data and decompresses it at the GPU cache level right before performing an FP16 computation. This approach is powerful because it can reduce the number of GPUs needed to serve the model by 4X without sacrificing inference speed (some constraints may apply; for example, the batch size cannot exceed a certain number). FMS Model Optimizer supports this "weight-only compression", sometimes referred to as W4A16 or [GPTQ](https://arxiv.org/pdf/2210.17323), by leveraging `gptqmodel`, a third-party library, to perform quantization.
 
 ## Requirements
 
 - [FMS Model Optimizer requirements](../../README.md#requirements)
-- `auto-gptq` is needed for this example. Use `pip install auto-gptq` or [install from source](https://github.com/AutoGPTQ/AutoGPTQ?tab=readme-ov-file#install-from-source)
+- `gptqmodel` is needed for this example. Use `pip install gptqmodel` or [install from source](https://github.com/ModelCloud/GPTQModel/tree/main?tab=readme-ov-file)
 - Optionally for the evaluation section below, install [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness)
 ```
 pip install lm-eval
 ```
@@ -32,7 +32,7 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
 > - Tokenized data will be saved in `_train` and `_test`
 > - If you have trouble downloading Llama family of models from Hugging Face ([LLama models require access](https://www.llama.com/docs/getting-the-models/hugging-face/)), you can use `ibm-granite/granite-8b-code` instead
 
-2. **Quantize the model** using the data generated above, the following command will kick off the quantization job (by invoking `auto_gptq` under the hood.) Additional acceptable arguments can be found here in [GPTQArguments](../../fms_mo/training_args.py#L127).
+2. **Quantize the model** using the data generated above. The following command will kick off the quantization job (by invoking `gptqmodel` under the hood). Additional acceptable arguments can be found in [GPTQArguments](../../fms_mo/training_args.py#L127).
```bash python -m fms_mo.run_quant \ @@ -49,8 +49,8 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m > - In GPTQ, `group_size` is a trade-off between accuracy and speed, but there is an additional constraint that `in_features` of the Linear layer to be quantized needs to be an **integer multiple** of `group_size`, i.e. some models may have to use smaller `group_size` than default. > [!TIP] -> 1. If you see error messages regarding `exllama_kernels` or `undefined symbol`, try install `auto-gptq` from [source](https://github.com/AutoGPTQ/AutoGPTQ?tab=readme-ov-file#install-from-source). -> 2. If you need to work on a custom model that is not supported by AutoGPTQ, please add your class wrapper [here](../../fms_mo/utils/custom_gptq_models.py). Additional information [here](https://github.com/AutoGPTQ/AutoGPTQ?tab=readme-ov-file#customize-model). +> 1. If you see error messages regarding `exllama_kernels` or `undefined symbol`, try installing `gptqmodel` from [source](https://github.com/ModelCloud/GPTQModel/tree/main?tab=readme-ov-file). +> 2. If you need to work on a custom model that is not supported by GPTQModel, please add your class wrapper [here](../../fms_mo/utils/custom_gptq_models.py). Additional information [here](https://github.com/ModelCloud/GPTQModel/tree/main?tab=readme-ov-file#how-to-add-support-for-a-new-model). 3. **Inspect the GPTQ checkpoint** ```python @@ -62,10 +62,10 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m ``` layer mem (MB) - dtype - torch.float16 224 109.051904 - torch.float32 67 4203.757568 - torch.int32 672 3521.904640 + dtype + torch.bfloat16 67 2101.878784 + torch.float16 224 109.051904 + torch.int32 672 3521.904640 ``` 4. **Evaluate the quantized model**'s performance on a selected task using `lm-eval` library, the command below will run evaluation on [`lambada_openai`](https://huggingface.co/datasets/EleutherAI/lambada_openai) task and show the perplexity/accuracy at the end. @@ -82,29 +82,23 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m ## Example Test Results - Unquantized Model -```bash - |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| - |------------|--------------|------:|------|-----:|----------|---|-----:|---|-----:| - | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.7103|± |0.0063| - | | | |none | 5|perplexity|↓ |3.7915|± |0.0727| -``` +|Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| +|------------|--------------|------:|------|-----:|----------|---|-----:|---|-----:| +| LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.7103|± |0.0063| +| | | |none | 5|perplexity|↓ |3.7915|± |0.0727| - Quantized model with the settings showed above (`desc_act` default to False.) -```bash - |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| - |------------|--------------|------:|------|-----:|----------|---|------:|---|-----:| - | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.4271 |± |0.0069| - | | | |none | 5|perplexity|↓ |39.2316|± |2.2090| -``` - +|Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| +|------------|--------------|------:|------|-----:|----------|---|------:|---|-----:| +| LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6365 |± |0.0067| +| | | |none | 5|perplexity|↓ |5.9307 |± |0.1830| - Quantized model with `desc_act` set to `True` (could improve the model quality, but at the cost of inference speed.) 
-```bash
-    |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
-    |------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
-    | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6193 |± |0.0068|
-    | | | |none | 5|perplexity|↓ |5.8879 |± |0.1546|
-```
+|Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+|------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
+| LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6193 |± |0.0068|
+| | | |none | 5|perplexity|↓ |5.8879 |± |0.1546|
+
 
 > [!NOTE]
 > There is some randomness in generating the model and data, the resulting accuracy may vary ~$\pm$ 0.05.
@@ -114,21 +108,25 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
 
 1. Command line arguments will be used to create a GPTQ quantization config. Information about the required arguments and their default values can be found [here](../../fms_mo/training_args.py)
 
     ```python
-    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-    quantize_config = BaseQuantizeConfig(
-        bits=gptq_args.bits,
-        group_size=gptq_args.group_size,
-        desc_act=gptq_args.desc_act,
-        damp_percent=gptq_args.damp_percent)
+    from gptqmodel import GPTQModel, QuantizeConfig
+
+    quantize_config = QuantizeConfig(
+        bits=gptq_args.bits,
+        group_size=gptq_args.group_size,
+        desc_act=gptq_args.desc_act,
+        damp_percent=gptq_args.damp_percent,
+    )
+
     ```
 
-2. Load the pre_trained model with `auto_gptq` class/wrapper. Tokenizer is optional because we already tokenized the data in a previous step.
+2. Load the pre-trained model with the `gptqmodel` class/wrapper. The tokenizer is optional because we already tokenized the data in a previous step.
 
     ```python
-    model = AutoGPTQForCausalLM.from_pretrained(
-        model_args.model_name_or_path,
-        quantize_config=quantize_config,
-        torch_dtype=model_args.torch_dtype)
+    model = GPTQModel.from_pretrained(
+        model_args.model_name_or_path,
+        quantize_config=quantize_config,
+        torch_dtype=model_args.torch_dtype,
+    )
     ```
 
3. Load the tokenized dataset from disk.
@@ -143,9 +141,9 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
     ```python
     model.quantize(
         data,
-        use_triton=gptq_args.use_triton,
+        backend=BACKEND.TRITON if gptq_args.use_triton else BACKEND.AUTO,
         batch_size=gptq_args.batch_size,
-        cache_examples_on_gpu=gptq_args.cache_examples_on_gpu,
+        calibration_enable_gpu_cache=gptq_args.cache_examples_on_gpu,
     )
     ```
diff --git a/fms_mo/custom_ext_kernels/utils.py b/fms_mo/custom_ext_kernels/utils.py
index 76898443..0d5fffb0 100644
--- a/fms_mo/custom_ext_kernels/utils.py
+++ b/fms_mo/custom_ext_kernels/utils.py
@@ -14,7 +14,7 @@
 
 """This file contains external kernel registrations, compilation, and packing functions.
 
-Some functions may require additional packages, e.g. auto_gptq, cutlass (source clone)
+Some functions may require additional packages, e.g. gptqmodel, cutlass (source clone)
 """
 
 # pylint: disable=ungrouped-imports,unused-argument,c-extension-no-member
@@ -491,27 +491,29 @@ def create_test_tensors(Nbatch, M, N, K, ele_type, accum_type):
 
 
 def exllama_ops_load_and_reg(qcfg=None, run_unit_test=False):
-    """Register Exllama kernels borrowed from auto-gptq
+    """Register Exllama kernels borrowed from gptqmodel
 
     Args:
        qcfg: dict. quant config
       run_unit_test: bool. Run unit tests after Op registration. (if unit tests defined.)
 
     NOTE:
-        1. need to install auto-gptq python package
+        1. need to install gptqmodel python package
        2. Op registration signature changed drastically from torch 2.1 - 2.4. TODO: add 2.4 support
-        see https://github.com/AutoGPTQ/AutoGPTQ for installation instruction
+        see https://github.com/ModelCloud/GPTQModel for installation instructions
     """
     if qcfg is None:
         qcfg = {}
     elif qcfg:
-        qcfg["AUTOGPTQ_AVAILABLE"] = False
+        qcfg["GPTQMODEL_AVAILABLE"] = False
 
-    namespace = "autogptq_gemm"
+    namespace = "gptqmodel_gemm"
 
     # check before compile
-    if hasattr(torch.ops, namespace) and hasattr(torch.ops.autogptq_gemm, "exv1_i4f16"):
-        logger.info("Custom AutoGPTQ functions have been loaded already!")
-        qcfg["AUTOGPTQ_AVAILABLE"] = True
+    if hasattr(torch.ops, namespace) and hasattr(
+        torch.ops.gptqmodel_gemm, "exv1_i4f16"
+    ):
+        logger.info("Custom GPTQModel functions have been loaded already!")
+        qcfg["GPTQMODEL_AVAILABLE"] = True
         need_registration = False
     else:
         need_registration = (
@@ -521,14 +523,14 @@ def exllama_ops_load_and_reg(qcfg=None, run_unit_test=False):
 
     if not need_registration:
         logger.warning(
-            "Please check the installation of AutoGPTQ package."
+            "Please check the installation of the GPTQModel package. "
             "External kernels cannot be used this time."
         )
         return
 
     # Third Party
-    import exllama_kernels
-    import exllamav2_kernels
+    import gptqmodel_exllama_kernels
+    import gptqmodel_exllamav2_kernels
 
     # Register op
     @reg_op(f"{namespace}::exv1_i4f16")
@@ -545,7 +547,7 @@ def exv1_i4f16_impl(x, q4, q4_width):
             (x.shape[0], q4_width), dtype=torch.float16, device=x.device
         )
 
-        exllama_kernels.q4_matmul(x, q4, output)
+        gptqmodel_exllama_kernels.q4_matmul(x, q4, output)
 
         return output.view(outshape)
 
     # Abstract implementation
@@ -573,7 +575,9 @@ def exv2_i4f16_impl(x, q_handle, q4_width, force_cuda):
             (x.shape[0], q4_width), dtype=torch.float16, device=x.device
         )
 
-        exllamav2_kernels.gemm_half_q_half(x, q_handle, output, force_cuda)
+        gptqmodel_exllamav2_kernels.gemm_half_q_half(
+            x, q_handle, output, force_cuda
+        )
 
         return output.view(outshape)
 
     # Abstract implementation
@@ -609,7 +613,9 @@ def exv2_i4f16_fxinputs_impl(
             (x.shape[0], q4_width), dtype=torch.float16, device=x.device
         )
 
-        exllamav2_kernels.gemm_half_q_half(x, q_handle, output, force_cuda)
+        gptqmodel_exllamav2_kernels.gemm_half_q_half(
+            x, q_handle, output, force_cuda
+        )
 
         return output.view(outshape)
 
     # Abstract implementation
@@ -623,10 +629,11 @@ def exv2_i4f16_fxinputs_abstract(
     )
 
     logger.info(
-        f"New AutoGPTQ gemm functions have been loaded and registered to torch.ops.{namespace}."
+        f"New GPTQModel gemm functions have been loaded and registered to "
+        f"torch.ops.{namespace}."
     )
 
     if qcfg:
-        qcfg["AUTOGPTQ_AVAILABLE"] = True
+        qcfg["GPTQMODEL_AVAILABLE"] = True
 
     if run_unit_test:
         return NotImplemented
@@ -1171,10 +1178,14 @@ def swap_nnlinear_to_quantlinear(model, qconfig, prefix=None, qlinear2use=None):
         QuantLinear = qlinear2use
     elif exVer == 1:
         # Third Party
-        from auto_gptq.nn_modules.qlinear.qlinear_exllama import QuantLinear
+        from gptqmodel.nn_modules.qlinear.exllama import (
+            ExllamaQuantLinear as QuantLinear,
+        )
     else:
         # Third Party
-        from auto_gptq.nn_modules.qlinear.qlinear_exllamav2 import QuantLinear
+        from gptqmodel.nn_modules.qlinear.exllamav2 import (
+            ExllamaV2QuantLinear as QuantLinear,
+        )
 
     num_swapped = 0
     for n, m in model.named_modules():
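A minimal sketch of how the renamed flag and namespace above are meant to be consumed (the call pattern is inferred from this file; it is not a documented public API):

```python
# Register the gptqmodel Exllama kernels, then check the availability flag
# that exllama_ops_load_and_reg() records in the quant config dict.
import torch

from fms_mo.custom_ext_kernels.utils import exllama_ops_load_and_reg

qcfg = {"placeholder": True}  # must be non-empty for the helper to record the flag
exllama_ops_load_and_reg(qcfg)

if qcfg.get("GPTQMODEL_AVAILABLE"):
    # torch.ops.gptqmodel_gemm.exv1_i4f16 / exv2_i4f16 are now registered
    print(hasattr(torch.ops, "gptqmodel_gemm"))
```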
diff --git a/fms_mo/fx/utils.py b/fms_mo/fx/utils.py
index 302d9924..357a877b 100644
--- a/fms_mo/fx/utils.py
+++ b/fms_mo/fx/utils.py
@@ -41,9 +41,9 @@
     # Local
     from fms_mo.modules.linear import QLinearExv1WI4AF16, QLinearExv2WI4AF16
 
-    autogptq_available = True
+    gptqmodel_available = True
 except ImportError:
-    autogptq_available = False
+    gptqmodel_available = False
 
 MIN_BLOCK_SIZE = 5
 
@@ -91,7 +91,7 @@ def check_qclass_fallback_based_on_min_feat(
     ]
     if cutlass_available:
         qclass_has_constraints += [QLinearCutlassI8I32NT]
-    if autogptq_available:
+    if gptqmodel_available:
         qclass_has_constraints += [QLinearExv1WI4AF16, QLinearExv2WI4AF16]
 
     qclass = type(ref_module)
@@ -129,7 +129,7 @@ def lower_qmodel_to_ext_kernels(
     1. user need to define a mapping thru qcfg["ext_kernel_mapping_mod"]
     2. to make it simple, only swap user specified qclass, nothing else
     3. move the module to GPU before swapping to accelerate scale/zp calculations
-    4. autogptq_post_init() must be done at model level, or OOM and incorrect results easily
+    4. gptq_post_init() must be done at model level, or OOM and incorrect results can easily occur
 
     Args:
         mod (torch.nn.Module): model to be 'lowered'
@@ -156,7 +156,7 @@ def lower_qmodel_to_ext_kernels(
     qclass_must_start_from_cpu = None
     using_gptq = False
     if (
-        available_packages["auto_gptq"]
-        and available_packages["exllama_kernels"]
-        and available_packages["exllamav2_kernels"]
+        available_packages["gptqmodel"]
+        and available_packages["gptqmodel_exllama_kernels"]
+        and available_packages["gptqmodel_exllamav2_kernels"]
     ):
@@ -207,9 +207,9 @@ def lower_qmodel_to_ext_kernels(
 
     if using_gptq:
         # Third Party
-        from auto_gptq.modeling._utils import autogptq_post_init
+        from gptqmodel.utils.model import hf_gptqmodel_post_init as gptq_post_init
 
-        mod_tmp = autogptq_post_init(mod_tmp, use_act_order=False)  # see Note 4
+        mod_tmp = gptq_post_init(mod_tmp, use_act_order=False)  # see Note 4
 
     mod.to(currDev)
     logger.info(mod)
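Note 4 above is easy to trip over, so a minimal sketch of the intended call order may help (`model` is a placeholder for the full model after all module swaps):

```python
# Post-init must see the entire model so device buffers are allocated once,
# globally; running it per-submodule risks OOM and incorrect results (Note 4).
from gptqmodel.utils.model import hf_gptqmodel_post_init as gptq_post_init

model = gptq_post_init(model, use_act_order=False)  # whole model, not one layer
```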
diff --git a/fms_mo/modules/linear.py b/fms_mo/modules/linear.py
index e8197598..b79363f5 100644
--- a/fms_mo/modules/linear.py
+++ b/fms_mo/modules/linear.py
@@ -1501,14 +1501,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 try:
     # Third Party
-    from auto_gptq.nn_modules.qlinear.qlinear_exllama import (
-        QuantLinear as QLinearExllamaV1,
+    from gptqmodel.nn_modules.qlinear.exllama import (
+        ExllamaQuantLinear as QLinearExllamaV1,
     )
-    from auto_gptq.nn_modules.qlinear.qlinear_exllamav2 import (
-        QuantLinear as QLinearExllamaV2,
+    from gptqmodel.nn_modules.qlinear.exllamav2 import (
+        ExllamaV2QuantLinear as QLinearExllamaV2,
     )
-    from auto_gptq.nn_modules.qlinear.qlinear_exllamav2 import ext_gemm_half_q_half
-    from exllama_kernels import prepare_buffers, set_tuning_params
+    from gptqmodel.nn_modules.qlinear.exllamav2 import ext_gemm_half_q_half
+    from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params
     from transformers.pytorch_utils import Conv1D
 
     class QLinearExv1WI4AF16(QLinearExllamaV1):
@@ -1614,7 +1614,7 @@ def forward(self, x):
                 Tensor: Output tensor of shape (batch_size, out_features).
             """
             with torch.no_grad():
-                x = torch.ops.autogptq_gemm.exv1_i4f16(x.half(), self.q4, self.width)
+                x = torch.ops.gptqmodel_gemm.exv1_i4f16(x.half(), self.q4, self.width)
 
             if self.bias is not None:
                 x.add_(self.bias)
@@ -1764,7 +1764,7 @@ def from_fms_mo(cls, fms_mo_qlinear, **kwargs):
             if kwargs.get(
                 "useInductor", False
             ):  # anything other than False or None will use torch wrapped version
-                qlinear_ex.extOp = torch.ops.autogptq_gemm.exv2_i4f16
+                qlinear_ex.extOp = torch.ops.gptqmodel_gemm.exv2_i4f16
             else:
                 qlinear_ex.extOp = ext_gemm_half_q_half
 
@@ -1800,7 +1800,7 @@ def forward(self, x, force_cuda=False):
 
 except ModuleNotFoundError:
     logger.warning(
-        "AutoGPTQ is not properly installed. "
+        "GPTQModel is not properly installed. "
         "QLinearExv1WI4AF16 and QLinearExv2WI4AF16 wrappers will not be available."
     )
diff --git a/fms_mo/run_quant.py b/fms_mo/run_quant.py
index 6cb1f2b8..44878e9d 100644
--- a/fms_mo/run_quant.py
+++ b/fms_mo/run_quant.py
@@ -86,11 +86,11 @@ def quantize(
     logger.info(f"{fms_mo_args}\n{opt_args.quant_method}\n")
 
     if opt_args.quant_method == "gptq":
-        if not available_packages["auto_gptq"]:
+        if not available_packages["gptqmodel"]:
             raise ImportError(
                 "Quantization method has been selected as gptq but unable to use external library, "
-                "auto_gptq module not found. For more instructions on installing the appropriate "
-                "package, see https://github.com/AutoGPTQ/AutoGPTQ?tab=readme-ov-file#installation"
+                "gptqmodel module not found. For instructions on installing the appropriate "
+                "package, see https://github.com/ModelCloud/GPTQModel"
             )
         gptq_args.use_triton = gptq_args.use_triton and available_packages["triton"]
         run_gptq(model_args, data_args, opt_args, gptq_args)
@@ -100,7 +100,7 @@ def quantize(
                 "Quantization method has been selected as fp8 but unable to use external library, "
                 "llmcompressor module not found. \n"
                 "For more instructions on installing the appropriate package, see "
-                "https://github.com/vllm-project/llm-compressor/tree/"
-                "main?tab=readme-ov-file#installation"
+                "https://github.com/vllm-project/llm-compressor"
+                "?tab=readme-ov-file#installation"
             )
         run_fp8(model_args, data_args, opt_args, fp8_args)
@@ -126,28 +126,28 @@ def run_gptq(model_args, data_args, opt_args, gptq_args):
     """
     # Third Party
-    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-    from auto_gptq.modeling._const import SUPPORTED_MODELS
-    from auto_gptq.modeling.auto import GPTQ_CAUSAL_LM_MODEL_MAP
+    from gptqmodel import GPTQModel, QuantizeConfig
+    from gptqmodel.models.auto import MODEL_MAP, SUPPORTED_MODELS
+    from gptqmodel.utils.backend import BACKEND
 
     # Local
     from fms_mo.utils.custom_gptq_models import custom_gptq_classes
 
     logger = set_log_level(opt_args.log_level, "fms_mo.run_gptq")
 
-    quantize_config = BaseQuantizeConfig(
+    quantize_config = QuantizeConfig(
         bits=gptq_args.bits,
         group_size=gptq_args.group_size,
         desc_act=gptq_args.desc_act,
         damp_percent=gptq_args.damp_percent,
     )
 
-    # Add custom model_type mapping to auto_gptq LUT so AutoGPTQForCausalLM can recognize them.
+    # Add custom model_type mapping to gptqmodel LUT so GPTQModel can recognize them.
for mtype, cls in custom_gptq_classes.items(): SUPPORTED_MODELS.append(mtype) - GPTQ_CAUSAL_LM_MODEL_MAP[mtype] = cls + MODEL_MAP[mtype] = cls - model = AutoGPTQForCausalLM.from_pretrained( + model = GPTQModel.from_pretrained( model_args.model_name_or_path, quantize_config=quantize_config, torch_dtype=model_args.torch_dtype, @@ -166,9 +166,9 @@ def run_gptq(model_args, data_args, opt_args, gptq_args): start_time = time.time() model.quantize( data, - use_triton=gptq_args.use_triton, + backend=BACKEND.TRITON if gptq_args.use_triton else BACKEND.AUTO, batch_size=gptq_args.batch_size, - cache_examples_on_gpu=gptq_args.cache_examples_on_gpu, + calibration_enable_gpu_cache=gptq_args.cache_examples_on_gpu, ) logger.info( @@ -176,7 +176,7 @@ def run_gptq(model_args, data_args, opt_args, gptq_args): ) logger.info(f"Saving quantized model and tokenizer to {opt_args.output_dir}") - model.save_quantized(opt_args.output_dir, use_safetensors=True) + model.save_quantized(opt_args.output_dir) tokenizer.save_pretrained(opt_args.output_dir) diff --git a/fms_mo/training_args.py b/fms_mo/training_args.py index ce224521..e7beafc6 100644 --- a/fms_mo/training_args.py +++ b/fms_mo/training_args.py @@ -192,7 +192,7 @@ class FMSMOArguments(TypeChecker): @dataclass class GPTQArguments(TypeChecker): - """Dataclass for GPTQ related arguments that will be used by auto-gptq.""" + """Dataclass for GPTQ related arguments that will be used by gptqmodel.""" bits: int = field(default=4, metadata={"choices": [2, 3, 4, 8]}) group_size: int = field(default=-1) diff --git a/fms_mo/utils/custom_gptq_models.py b/fms_mo/utils/custom_gptq_models.py index d83145cb..b17a7ad0 100644 --- a/fms_mo/utils/custom_gptq_models.py +++ b/fms_mo/utils/custom_gptq_models.py @@ -15,16 +15,16 @@ """Allow users to add new GPTQ classes for their custom models easily.""" # Third Party -from auto_gptq.modeling import BaseGPTQForCausalLM +from gptqmodel.models.base import BaseGPTQModel -class GraniteGPTQForCausalLM(BaseGPTQForCausalLM): +class GraniteGPTQForCausalLM(BaseGPTQModel): """Enable Granite for GPTQ.""" layer_type = "GraniteDecoderLayer" - layers_block_name = "model.layers" - outside_layer_modules = ["model.embed_tokens", "model.norm"] - inside_layer_modules = [ + layers_node = "model.layers" + base_modules = ["model.embed_tokens", "model.norm"] + layer_modules = [ ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], ["mlp.up_proj", "mlp.gate_proj"], @@ -32,13 +32,13 @@ class GraniteGPTQForCausalLM(BaseGPTQForCausalLM): ] -class GraniteMoeGPTQForCausalLM(BaseGPTQForCausalLM): +class GraniteMoeGPTQForCausalLM(BaseGPTQModel): """Enable Granite MOE for GPTQ.""" layer_type = "GraniteMoeDecoderLayer" - layers_block_name = "model.layers" - outside_layer_modules = ["model.embed_tokens", "model.norm"] - inside_layer_modules = [ + layers_node = "model.layers" + base_modules = ["model.embed_tokens", "model.norm"] + layer_modules = [ ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], ["block_sparse_moe.input_linear", "block_sparse_moe.output_linear"], diff --git a/fms_mo/utils/import_utils.py b/fms_mo/utils/import_utils.py index bb13afff..f4b1538d 100644 --- a/fms_mo/utils/import_utils.py +++ b/fms_mo/utils/import_utils.py @@ -21,9 +21,9 @@ import torch optional_packages = [ - "auto_gptq", - "exllama_kernels", - "exllamav2_kernels", + "gptqmodel", + "gptqmodel_exllama_kernels", + "gptqmodel_exllamav2_kernels", "llmcompressor", "mx", "matplotlib", diff --git a/pyproject.toml b/pyproject.toml 
index 440d0be5..6df9523a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ dependencies = [ [project.optional-dependencies] dev = ["pre-commit>=3.0.4,<5.0"] fp8 = ["llmcompressor"] -gptq = ["auto_gptq>0.4.2", "optimum>=1.15.0"] +gptq = ["Cython", "gptqmodel>=1.7.3"] mx = ["microxcaling>=1.1"] visualize = ["matplotlib", "graphviz", "pygraphviz"] flash-attn = ["flash-attn>=2.5.3,<3.0"] diff --git a/tests/build/test_launch_script.py b/tests/build/test_launch_script.py index 1a67688f..54f8d3c3 100644 --- a/tests/build/test_launch_script.py +++ b/tests/build/test_launch_script.py @@ -86,8 +86,8 @@ def cleanup_env(): @pytest.mark.skipif( - not available_packages["auto_gptq"], - reason="Only runs if auto-gptq package is installed", + not available_packages["gptqmodel"], + reason="Only runs if gptqmodel package is installed", ) def test_successful_gptq(): """Check if we can gptq models""" @@ -254,7 +254,7 @@ def _validate_quantization_output(base_dir, quant_method): # Check quantized model files exist if quant_method == "gptq": - assert len(glob.glob(os.path.join(base_dir, "gptq_model-*.safetensors"))) > 0 + assert len(glob.glob(os.path.join(base_dir, "model*.safetensors"))) > 0 assert os.path.exists(os.path.join(base_dir, "quantize_config.json")) is True assert os.path.exists(os.path.join(base_dir, "config.json")) is True
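Taken together, the migrated `run_quant.py` flow reduces to a handful of `gptqmodel` calls. A minimal end-to-end sketch against `gptqmodel>=1.7.3` as pinned above (the model ID, calibration text, and output directory are placeholders; `fms_mo.run_quant` normally derives these from its own arguments):

```python
# End-to-end GPTQ quantization sketch mirroring the calls in fms_mo/run_quant.py.
from gptqmodel import GPTQModel, QuantizeConfig
from gptqmodel.utils.backend import BACKEND
from transformers import AutoTokenizer

model_id = "ibm-granite/granite-8b-code"  # placeholder model ID

quantize_config = QuantizeConfig(bits=4, group_size=128, desc_act=False)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = GPTQModel.from_pretrained(model_id, quantize_config=quantize_config)

# Tokenized calibration samples; real runs load a dataset prepared ahead of time.
data = [tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")]

model.quantize(
    data,
    backend=BACKEND.AUTO,  # BACKEND.TRITON when triton is available
    batch_size=1,
    calibration_enable_gpu_cache=True,
)

model.save_quantized("granite-8b-gptq")  # writes model*.safetensors + quantize_config.json
tokenizer.save_pretrained("granite-8b-gptq")
```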