FIX Broken tests with torchao >= 0.16 (#3101)

BenjaminBossan · web-flow · commit 2d488820ab03 · 2026-03-31T14:27:38.000+02:00
Torchao made some API changes, which have to be reflected in the tests. Moreover, for this to pass, we also need transformers to make the corresponding adjustments: huggingface/transformers#44604. While working on this, I migrated the tests from unittest to pytest style.
diff --git a/docs/source/developer_guides/quantization.md b/docs/source/developer_guides/quantization.md
@@ -258,9 +258,10 @@ PEFT supports models quantized with [torchao](https://github.com/pytorch/ao) ("a
 ```python
 from peft import LoraConfig, get_peft_model
 from transformers import AutoModelForCausalLM, TorchAoConfig
+from torchao.quantization import Int8WeightOnlyConfig
 
 model_id = ...
-quantization_config = TorchAoConfig(quant_type="int8_weight_only")
+quantization_config = TorchAoConfig(quant_type=Int8WeightOnlyConfig())
 base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
 peft_config = LoraConfig(...)
 model = get_peft_model(base_model, peft_config)
diff --git a/examples/sequence_classification/LoRA-torchao-8bit-dynamic-activation.ipynb b/examples/sequence_classification/LoRA-torchao-8bit-dynamic-activation.ipynb
@@ -29,6 +29,7 @@
     "import torch\n",
     "from torch.optim import AdamW\n",
     "from torch.utils.data import DataLoader\n",
+    "from torchao.quantization import Int8DynamicActivationInt8WeightConfig\n",
     "from peft import (\n",
     "    get_peft_config,\n",
     "    get_peft_model,\n",
@@ -205,7 +206,7 @@
     }
    ],
    "source": [
-    "quant_config = TorchAoConfig(quant_type=\"int8_dynamic_activation_int8_weight\")\n",
+    "quant_config = TorchAoConfig(quant_type=Int8DynamicActivationInt8WeightConfig())\n",
     "model = AutoModelForSequenceClassification.from_pretrained(\n",
     "    model_name_or_path, return_dict=True, device_map=0, dtype=torch.bfloat16, quantization_config=quant_config\n",
     ")"
diff --git a/examples/sequence_classification/LoRA-torchao-8bit.ipynb b/examples/sequence_classification/LoRA-torchao-8bit.ipynb
@@ -29,6 +29,7 @@
     "import torch\n",
     "from torch.optim import AdamW\n",
     "from torch.utils.data import DataLoader\n",
+    "from torchao.quantization import Int8WeightOnlyConfig\n",
     "from peft import (\n",
     "    get_peft_config,\n",
     "    get_peft_model,\n",
@@ -205,7 +206,7 @@
     }
    ],
    "source": [
-    "quant_config = TorchAoConfig(quant_type=\"int8_weight_only\")\n",
+    "quant_config = TorchAoConfig(quant_type=Int8WeightOnlyConfig())\n",
     "model = AutoModelForSequenceClassification.from_pretrained(\n",
     "    model_name_or_path, return_dict=True, device_map=0, dtype=torch.bfloat16, quantization_config=quant_config\n",
     ")"
diff --git a/src/peft/import_utils.py b/src/peft/import_utils.py
@@ -128,7 +128,7 @@ def is_torchao_available():
     if importlib.util.find_spec("torchao") is None:
         return False
 
-    TORCHAO_MINIMUM_VERSION = packaging.version.parse("0.4.0")
+    TORCHAO_MINIMUM_VERSION = packaging.version.parse("0.16.0")
     try:
         torchao_version = packaging.version.parse(importlib_metadata.version("torchao"))
     except importlib_metadata.PackageNotFoundError:
diff --git a/src/peft/tuners/lora/torchao.py b/src/peft/tuners/lora/torchao.py
@@ -41,16 +41,11 @@ def __init__(self, *args, get_apply_tensor_subclass, **kwargs):
 
     def _check_dtype_supported(self):
         # TODO: Not required once int4_weight_only is properly supported by torchao
+        from torchao.quantization import Int4Tensor
+
         base_layer = self.get_base_layer()
         weight = base_layer.weight
-        # pytest tests/test_gpu_examples.py::PeftTorchaoGPUTests::test_causal_lm_training_single_gpu_torchao_0_int8_weight_only
-        if (
-            # torchao 0.7.0+
-            (hasattr(weight, "tensor_impl") and (weight.tensor_impl.data.dtype != torch.int8))
-            or
-            # torchao < 0.7.0
-            (hasattr(weight, "layout_tensor") and (weight.layout_tensor.data.dtype != torch.int8))
-        ):
+        if isinstance(weight, Int4Tensor):
             raise ValueError(f"{type(self).__name__} only supports int8 weights for now.")
 
     def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
@@ -148,9 +143,9 @@ def dispatch_torchao(
         return new_module
 
     from torchao.dtypes import AffineQuantizedTensor
-    from torchao.quantization import LinearActivationQuantizedTensor
+    from torchao.quantization import Int4Tensor, LinearActivationQuantizedTensor
 
-    if isinstance(target_base_layer.weight, (AffineQuantizedTensor, LinearActivationQuantizedTensor)):
+    if isinstance(target_base_layer.weight, (AffineQuantizedTensor, Int4Tensor, LinearActivationQuantizedTensor)):
         new_module = TorchaoLoraLinear(target, adapter_name, config=config, **kwargs)
 
     return new_module
diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py
@@ -4251,44 +4251,50 @@ def test_causal_lm_training_multi_gpu_eetq(self):
 
 @require_non_cpu
 @require_torchao
-class PeftTorchaoGPUTests(unittest.TestCase):
-    r"""
-    torchao + peft tests
-    """
-
+class TestPeftTorchao:
+    causal_lm_model_id = "peft-internal-testing/opt-125m"
     supported_quant_types = [
         "int8_weight_only",
         "int8_dynamic_activation_int8_weight",
         # int4_weight_only raises an error:
-        # RuntimeError: derivative for aten::_weight_int4pack_mm is not implemented
+        # RuntimeError: We encountered some issues during automatic conversion of the weights
         # "int4_weight_only",
     ]
 
-    def setUp(self):
-        self.causal_lm_model_id = "peft-internal-testing/opt-125m"
-        self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)
-        # torchao breaks with fp16 and if a previous test uses fp16, transformers will set this env var, which affects
-        # subsequent tests, therefore the env var needs to be cleared explicitly
-        #
-        # TODO: remove this once https://github.com/huggingface/transformers/pull/39483 is merged
-        os.environ.pop("ACCELERATE_MIXED_PRECISION", None)
+    @pytest.fixture(scope="class")
+    def tokenizer(self):
+        return AutoTokenizer.from_pretrained(self.causal_lm_model_id)
 
-    def tearDown(self):
-        r"""
-        Efficient mechanism to free GPU memory after each test. Based on
-        https://github.com/huggingface/transformers/issues/21094
-        """
+    @pytest.fixture(autouse=True)
+    def setup_teardown(self):
+        # Function-scoped on purpose so that GPU memory is freed after each test, as the former tearDown did. Based on
+        # https://github.com/huggingface/transformers/issues/21094
+        yield
         clear_device_cache(garbage_collection=True)
 
-    @parameterized.expand(supported_quant_types)
+    @staticmethod
+    def get_quant_type(quant_type: str):
+        from torchao.quantization import (
+            Int4WeightOnlyConfig,
+            Int8DynamicActivationInt8WeightConfig,
+            Int8WeightOnlyConfig,
+        )
+        # Map the legacy string names to torchao config classes; only the requested config is instantiated
+        return {
+            "int4_weight_only": Int4WeightOnlyConfig,
+            "int8_weight_only": Int8WeightOnlyConfig,
+            "int8_dynamic_activation_int8_weight": Int8DynamicActivationInt8WeightConfig,
+        }[quant_type]()
+
+    @pytest.mark.parametrize("quant_type", supported_quant_types)
     @pytest.mark.single_gpu_tests
-    def test_causal_lm_training_single_gpu_torchao(self, quant_type):
+    def test_causal_lm_training_single_gpu_torchao(self, quant_type, tokenizer):
         from transformers import TorchAoConfig
 
         device = 0
 
         with tempfile.TemporaryDirectory() as tmp_dir:
-            quantization_config = TorchAoConfig(quant_type=quant_type)
+            quantization_config = TorchAoConfig(quant_type=self.get_quant_type(quant_type))
             model = AutoModelForCausalLM.from_pretrained(
                 self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
             )
@@ -4305,7 +4311,7 @@ def test_causal_lm_training_single_gpu_torchao(self, quant_type):
             model = get_peft_model(model, config)
 
             data = load_dataset_english_quotes()
-            data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)
+            data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
 
             trainer = Trainer(
                 model=model,
@@ -4319,7 +4325,7 @@ def test_causal_lm_training_single_gpu_torchao(self, quant_type):
                     logging_steps=1,
                     output_dir=tmp_dir,
                 ),
-                data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
+                data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
             )
             trainer.model.config.use_cache = False
             trainer.train()
@@ -4333,13 +4339,13 @@ def test_causal_lm_training_single_gpu_torchao(self, quant_type):
             assert trainer.state.log_history[-1]["train_loss"] is not None
 
     @pytest.mark.single_gpu_tests
-    def test_causal_lm_training_single_gpu_torchao_dora_int8_weight_only(self):
+    def test_causal_lm_training_single_gpu_torchao_dora_int8_weight_only(self, tokenizer):
         from transformers import TorchAoConfig
 
         device = 0
 
         with tempfile.TemporaryDirectory() as tmp_dir:
-            quantization_config = TorchAoConfig(quant_type="int8_weight_only")
+            quantization_config = TorchAoConfig(quant_type=self.get_quant_type("int8_weight_only"))
             model = AutoModelForCausalLM.from_pretrained(
                 self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
             )
@@ -4357,7 +4363,7 @@ def test_causal_lm_training_single_gpu_torchao_dora_int8_weight_only(self):
             model = get_peft_model(model, config)
 
             data = load_dataset_english_quotes()
-            data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)
+            data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
 
             trainer = Trainer(
                 model=model,
@@ -4371,7 +4377,7 @@ def test_causal_lm_training_single_gpu_torchao_dora_int8_weight_only(self):
                     logging_steps=1,
                     output_dir=tmp_dir,
                 ),
-                data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
+                data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
             )
             trainer.model.config.use_cache = False
             trainer.train()
@@ -4390,7 +4396,7 @@ def test_causal_lm_training_single_gpu_torchao_dora_int8_dynamic_activation_int8
 
         device = 0
 
-        quantization_config = TorchAoConfig(quant_type="int8_dynamic_activation_int8_weight")
+        quantization_config = TorchAoConfig(quant_type=self.get_quant_type("int8_dynamic_activation_int8_weight"))
         model = AutoModelForCausalLM.from_pretrained(
             self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
         )
@@ -4419,7 +4425,7 @@ def test_causal_lm_training_single_gpu_torchao_int4_raises(self):
 
         device = 0
 
-        quantization_config = TorchAoConfig(quant_type="int4_weight_only")
+        quantization_config = TorchAoConfig(quant_type=self.get_quant_type("int4_weight_only"))
         model = AutoModelForCausalLM.from_pretrained(
             self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
         )
@@ -4441,10 +4447,10 @@ def test_causal_lm_training_single_gpu_torchao_int4_raises(self):
         # tested in multiple matchines
         model(inputs)
 
-    @parameterized.expand(supported_quant_types)
+    @pytest.mark.parametrize("quant_type", supported_quant_types)
     @pytest.mark.multi_gpu_tests
     @require_torch_multi_accelerator
-    def test_causal_lm_training_multi_accelerator_torchao(self, quant_type):
+    def test_causal_lm_training_multi_accelerator_torchao(self, quant_type, tokenizer):
         from transformers import TorchAoConfig
 
         device_map = {
@@ -4469,7 +4475,7 @@ def test_causal_lm_training_multi_accelerator_torchao(self, quant_type):
         }
 
         with tempfile.TemporaryDirectory() as tmp_dir:
-            quantization_config = TorchAoConfig(quant_type=quant_type)
+            quantization_config = TorchAoConfig(quant_type=self.get_quant_type(quant_type))
             model = AutoModelForCausalLM.from_pretrained(
                 self.causal_lm_model_id,
                 device_map=device_map,
@@ -4495,7 +4501,7 @@ def test_causal_lm_training_multi_accelerator_torchao(self, quant_type):
             model = get_peft_model(model, config)
 
             data = load_dataset_english_quotes()
-            data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)
+            data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
 
             trainer = Trainer(
                 model=model,
@@ -4509,7 +4515,7 @@ def test_causal_lm_training_multi_accelerator_torchao(self, quant_type):
                     logging_steps=1,
                     output_dir=tmp_dir,
                 ),
-                data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
+                data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
             )
             trainer.model.config.use_cache = False
             trainer.train()
@@ -4550,7 +4556,7 @@ def test_causal_lm_training_multi_accelerator_torchao_int4_raises(self):
             "model.decoder.layers.11": 1,
             "model.decoder.final_layer_norm": 1,
         }
-        quantization_config = TorchAoConfig(quant_type="int4_weight_only")
+        quantization_config = TorchAoConfig(self.get_quant_type(quant_type="int4_weight_only"))
         model = AutoModelForCausalLM.from_pretrained(
             self.causal_lm_model_id,
             device_map=device_map,
@@ -4588,7 +4594,7 @@ def test_torchao_merge_layers_int8_weight_only(self):
         device = 0
         dummy_input = torch.arange(10).view(-1, 1).to(device)
 
-        quantization_config = TorchAoConfig(quant_type=quant_type)
+        quantization_config = TorchAoConfig(self.get_quant_type(quant_type=quant_type))
         model = AutoModelForCausalLM.from_pretrained(
             self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
         ).eval()
@@ -4641,7 +4647,7 @@ def test_torchao_merge_layers_int8_dynamic_activation_int8_weight_raises(self):
         torch.manual_seed(0)
         device = 0
 
-        quantization_config = TorchAoConfig(quant_type=quant_type)
+        quantization_config = TorchAoConfig(quant_type=self.get_quant_type(quant_type))
         model = AutoModelForCausalLM.from_pretrained(
             self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
         ).eval()