fix brumby compat, thread safety (#2749)

Qubitium · web-flow · commit b8762ffba30e · 2026-04-16T20:35:23.000+08:00
* fix brumby compat

* fix marlin jit test generation

* fix thread pool warmup race

* fix virtual warmup handoff
diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py
@@ -42,6 +42,7 @@
 from ..utils.backend import BACKEND
 from ..utils.exllamav3 import build_exllamav3_tensor_storage
 from ..utils.hf import (
+    _normalize_legacy_tied_weights_keys,
     prepare_remote_code_compat,
     sanitize_generation_config_file,
     sanitize_model_config,
@@ -609,6 +610,7 @@ def strip_attention_impl_fields(target: Any) -> Dict[str, Any]:
             removed_config_attention_attrs = strip_attention_impl_fields(self.model.config)
             if generation_config is not None:
                 removed_generation_attention_attrs = strip_attention_impl_fields(generation_config)
+            _normalize_legacy_tied_weights_keys(self.model)
 
             # Save model config, including generation_config
             # Use empty state_dict hack to bypass saving weights
diff --git a/gptqmodel/utils/threadx.py b/gptqmodel/utils/threadx.py
@@ -269,6 +269,56 @@ def __exit__(self, exc_type, exc, tb):
         return self._group.__exit__(exc_type, exc, tb)
 
 
+class _WorkerWarmupState:
+    """
+    Shared once-per-physical-device warmup coordination.
+
+    The first worker that reaches this state performs the warmup. Other workers wait on the
+    completion event without holding the pool registry lock and re-raise any warmup failure.
+    """
+
+    def __init__(self, warmup_fn: Callable[[torch.device], None]):
+        self._warmup_fn = warmup_fn
+        self._claim_lock = threading.Lock()  # guards _started and _error
+        self._started = False  # True once a caller has claimed the warmup
+        self._done = threading.Event()  # set when the claimed warmup finishes, success or failure
+        self._error: Optional[BaseException] = None  # exception raised by the warmup, if any
+
+    def run(self, *, device: torch.device, rwlock: _RWLock) -> None:  # run the shared warmup or wait for it
+        if self._done.is_set():  # fast path: the warmup already finished
+            self._raise_if_failed()
+            return
+
+        should_run = False
+        with self._claim_lock:
+            if self._done.is_set():
+                pass
+            elif not self._started:
+                self._started = True  # only the first caller to get here claims the warmup
+                should_run = True
+
+        if should_run:
+            try:
+                with ctx(rwlock.reader(), _device_ctx(device)):  # uses the claiming caller's lock and device
+                    self._warmup_fn(device)
+            except BaseException as exc:
+                with self._claim_lock:
+                    self._error = exc  # stored before the finally below sets _done
+                raise
+            finally:
+                self._done.set()  # always release waiters, even when the warmup raised
+        else:
+            self._done.wait()  # no timeout: blocks until the claiming caller finishes
+
+        self._raise_if_failed()
+
+    def _raise_if_failed(self) -> None:  # re-raise the recorded warmup failure, if there is one
+        with self._claim_lock:
+            error = self._error
+        if error is not None:
+            raise error  # the same exception object is raised to every caller
+
+
 # --------------------------- Worker Thread ---------------------------
 # Each worker is bound to a specific device and runs a single thread. Tasks are
 # executed under the device’s read lock; GC acquires the writer lock to keep
@@ -292,15 +342,15 @@ def __init__(
         name: Optional[str] = None,
         inference_mode: bool = False,
         cpu_core: Optional[int] = None,
-        warmup_fn: Optional[Callable[[torch.device], None]] = None,
+        warmup_state: Optional[_WorkerWarmupState] = None,
         *,
         key_override: Optional[str] = None,
     ):
         self.device = device
         self.rwlock = rwlock
         self._on_task_finished = on_task_finished
         self._on_worker_exit = on_worker_exit
-        self._warmup_fn = warmup_fn
+        self._warmup_state = warmup_state
 
         if key_override is not None:
             self.key = key_override
@@ -375,14 +425,11 @@ def _apply_cpu_affinity(self) -> None:
             self._affinity_applied = True
 
     def _run_warmup(self) -> None:
-        warmup_fn = self._warmup_fn
-        if warmup_fn is None:
+        warmup_state = self._warmup_state
+        if warmup_state is None:
             return
-        try:
-            with ctx(self.rwlock.reader(), _device_ctx(self.device)):
-                warmup_fn(self.device)
-        finally:
-            self._warmup_fn = None
+        warmup_state.run(device=self.device, rwlock=self.rwlock)  # performs the warmup or waits for the worker that claimed it
+        self._warmup_state = None  # reached only on success; unlike the removed finally, a failure keeps the reference
 
     def _run(self):
         """
@@ -636,7 +683,7 @@ def __init__(
             {str(k).lower(): fn for k, fn in warmups.items()} if warmups else None
         )
         self._warmup_lock = threading.Lock()
-        self._warmup_ran_keys: Set[str] = set()
+        self._warmup_states: Dict[str, _WorkerWarmupState] = {}
 
         workers_cfg = workers or {}
         base_workers: Dict[str, int] = {}
@@ -890,7 +937,11 @@ def _priority(dev_type: str) -> int:
 
         return plan
 
-    def _resolve_worker_warmup(self, dev: torch.device, key: str) -> Optional[Callable[[torch.device], None]]:
+    def _resolve_worker_warmup(
+        self,
+        dev: torch.device,
+        key: str,
+    ) -> Optional[_WorkerWarmupState]:
         mapping = self._worker_warmups
         if not mapping:
             return None
@@ -904,13 +955,14 @@ def _resolve_worker_warmup(self, dev: torch.device, key: str) -> Optional[Callab
         if warmup is None:
             return None
 
-        # Map virtual workers back to their parent key so warmup runs once per physical device.
+        # Virtual workers share the same physical-device warmup state as their parent.
         physical_key = self._virtual_to_parent.get(key, key)
         with self._warmup_lock:
-            if physical_key in self._warmup_ran_keys:
-                return None
-            self._warmup_ran_keys.add(physical_key)
-        return warmup
+            state = self._warmup_states.get(physical_key)
+            if state is None:
+                state = _WorkerWarmupState(warmup)
+                self._warmup_states[physical_key] = state
+        return state
 
     def _spawn_worker(
         self,
@@ -922,7 +974,7 @@ def _spawn_worker(
         """
         Create and start a worker bound to the provided device.
         """
-        warmup_fn = self._resolve_worker_warmup(dev, key)
+        warmup_state = self._resolve_worker_warmup(dev, key)
         w = _DeviceWorker(
             device=dev,
             rwlock=self._locks[key],
@@ -931,7 +983,7 @@ def _spawn_worker(
             name=name,
             inference_mode=self._inference_mode,
             cpu_core=cpu_core,
-            warmup_fn=warmup_fn,
+            warmup_state=warmup_state,
             key_override=key,
         )
         return w
diff --git a/tests/models/model_test.py b/tests/models/model_test.py
@@ -179,7 +179,11 @@ class ModelTest(unittest.TestCase):
     INPUTS_MAX_LENGTH = 2048
     MODEL_MAX_LEN = 4096
     DATASET_SIZE = 512
+    DATASET_SIZE_FAST = None
+    DATASET_SIZE_SLOW = None
     DATASET_CONCAT_SIZE = None
+    DATASET_CONCAT_SIZE_FAST = None
+    DATASET_CONCAT_SIZE_SLOW = None
     DATASET_CONCAT_SEPARATOR = None
     DATASET_SORT = "desc"
     DELETE_QUANTIZED_MODEL = True
@@ -251,6 +255,8 @@ def setUpClass(cls):
     STOP_AFTER_LAYER: Optional[int] = None
     MOE_CONFIG: Optional[MoEConfig] = None
     OFFLOAD_TO_DISK: bool = True
+    OFFLOAD_TO_DISK_FAST = None
+    OFFLOAD_TO_DISK_SLOW = None
 
     GENERIC_TEST_PROMPTS = [
         {"prompt": "Which city is the capital city of France?", "keywords": ["paris"]},
@@ -339,6 +345,14 @@ def _mode_specific_baseline_value(self, attr_name: str):
 
         return self._resolve_metric_baseline_value(getattr(self, attr_name, None))
 
+    def _mode_specific_test_setting(self, attr_name: str):  # resolve <attr>_FAST / <attr>_SLOW, falling back to <attr>
+        mode_suffix = "FAST" if self._is_fast_model_test_mode() else "SLOW"
+        preferred = f"{attr_name}_{mode_suffix}"
+        value = getattr(self, preferred, None)
+        if value is not None:  # None means "no override"; False and 0 still count as overrides
+            return value
+        return getattr(self, attr_name, None)  # None when the base attribute is missing too
+
     def _legacy_metric_ceil_pct(self) -> float:
         if self._is_fast_model_test_mode():
             return 1.0
@@ -1499,7 +1513,7 @@ def _build_quantize_config(self):
             dynamic=self.DYNAMIC,
             hessian=HessianConfig(chunk_size=self.HESSIAN_CHUNK_SIZE),
             moe=self.MOE_CONFIG,
-            offload_to_disk=self.OFFLOAD_TO_DISK,
+            offload_to_disk=self._mode_specific_test_setting("OFFLOAD_TO_DISK"),
         )
 
     def quantModel(self, model_id_or_path, trust_remote_code=False, dtype="auto", need_eval=True, batch_size: int = QUANT_BATCH_SIZE, call_perform_post_quant_validation: bool = True, **kwargs):
@@ -1551,9 +1565,21 @@ def quantModel(self, model_id_or_path, trust_remote_code=False, dtype="auto", ne
 
         self._apply_model_compat_quant_overrides(model)
 
+        dataset_size = self._mode_specific_test_setting("DATASET_SIZE")
+        dataset_concat_size = self._mode_specific_test_setting("DATASET_CONCAT_SIZE")
+        log.info(
+            "Calibration dataset config: size=%s, concat_size=%s",
+            dataset_size,
+            dataset_concat_size,
+        )
+
         is_image_to_text_model = MODALITY.IMAGE_TO_TEXT in model.modality
         if quantize_config.requires_calibration_dataset():
-            calibration_dataset = get_calib_dataset(model) if is_image_to_text_model else self.load_dataset(tokenizer, self.DATASET_SIZE)
+            calibration_dataset = (
+                get_calib_dataset(model)
+                if is_image_to_text_model
+                else self.load_dataset(tokenizer, dataset_size)
+            )
         else:
             calibration_dataset = None
 
@@ -1577,7 +1603,7 @@ def quantModel(self, model_id_or_path, trust_remote_code=False, dtype="auto", ne
                 log.info(f"Quantized model artifacts will be saved to: {planned_save_path}")
                 model.quantize(
                     calibration_dataset,
-                    calibration_concat_size=self.DATASET_CONCAT_SIZE,
+                    calibration_concat_size=dataset_concat_size,
                     calibration_concat_separator=self.DATASET_CONCAT_SEPARATOR,
                     calibration_sort=self.DATASET_SORT,
                     backend=self.QUANT_BACKEND,
diff --git a/tests/models/test_brumby.py b/tests/models/test_brumby.py
@@ -13,6 +13,12 @@
 class TestBrumby(ModelTest):
     GROUP_SIZE = 32
     DATASET_SIZE = 1024
+    DATASET_SIZE_FAST = 128
+    # Brumby decoder layers are structurally uniform, so fast mode can quantize
+    # the first layers and avoid replaying 38 untouched layers during calibration.
+    MODEL_COMPAT_FAST_LAYER_COUNT = 1
+    MODEL_COMPAT_FAST_LAYER_POSITION = "first"
+    OFFLOAD_TO_DISK_FAST = False
     NATIVE_MODEL_ID = "/monster/data/model/Brumby-14B-Base"
     TRUST_REMOTE_CODE = True
     LOAD_MODEL_EXTRA_ARGS = {"use_cache": False}
@@ -41,7 +47,13 @@ class TestBrumby(ModelTest):
             "acc": {"value": 0.71, "floor_pct": 0.05, "ceil_pct": 0.10},
         },
     }
-    EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)
+    EVAL_TASKS_FAST = {
+        "arc_challenge": {
+            "evalution_batch_size": 8,
+            "evalution_suite_kwargs": {"max_rows": 32},
+            "acc": {"value": 0.89, "floor_pct": 0.10, "ceil_pct": 1.0},
+        },
+    }
 
     @classmethod
     def setUpClass(cls):
diff --git a/tests/models/test_model_test_fast_mode.py b/tests/models/test_model_test_fast_mode.py
@@ -15,6 +15,19 @@ def runTest(self):
         return None
 
 
+class _DatasetCompatCase(ModelTest):  # fixture: base values plus *_FAST overrides, no *_SLOW overrides
+    __test__ = False  # NOTE(review): presumably opts this helper out of test collection — confirm
+    DATASET_SIZE = 512
+    DATASET_SIZE_FAST = 128
+    DATASET_CONCAT_SIZE = 2048
+    DATASET_CONCAT_SIZE_FAST = 1024
+    OFFLOAD_TO_DISK = True
+    OFFLOAD_TO_DISK_FAST = False  # falsy override, paired with a truthy base value
+
+    def runTest(self):  # no-op target so the case can be built with methodName="runTest"
+        return None
+
+
 class _FakeQuantModel:
     def __init__(self, layer_count: int):
         self.model = SimpleNamespace(layers=nn.ModuleList([nn.Linear(1, 1) for _ in range(layer_count)]))
@@ -55,3 +68,21 @@ def test_model_test_fast_mode_first_layers_remain_configurable(monkeypatch):
         "-:^layers\\.4\\.",
         "-:^layers\\.5\\.",
     ]
+
+
+def test_model_test_fast_mode_uses_fast_dataset_overrides(monkeypatch):  # fast mode must pick the *_FAST values
+    monkeypatch.setenv("GPTQMODEL_MODEL_TEST_MODE", "fast")
+    case = _DatasetCompatCase(methodName="runTest")
+
+    assert case._mode_specific_test_setting("DATASET_SIZE") == 128
+    assert case._mode_specific_test_setting("DATASET_CONCAT_SIZE") == 1024
+    assert case._mode_specific_test_setting("OFFLOAD_TO_DISK") is False  # a False override must beat the True base value
+
+
+def test_model_test_slow_mode_uses_default_dataset_settings(monkeypatch):  # slow mode must ignore the *_FAST values
+    monkeypatch.setenv("GPTQMODEL_MODEL_TEST_MODE", "slow")
+    case = _DatasetCompatCase(methodName="runTest")
+
+    assert case._mode_specific_test_setting("DATASET_SIZE") == 512  # the case sets no *_SLOW override, so the base value applies
+    assert case._mode_specific_test_setting("DATASET_CONCAT_SIZE") == 2048
+    assert case._mode_specific_test_setting("OFFLOAD_TO_DISK") is True
diff --git a/tests/test_hf_utils.py b/tests/test_hf_utils.py
@@ -3,6 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # Contact: qubitium@modelcloud.ai, x.com/qubitium
 
+import tempfile
+
 from torch import nn
 from transformers import PretrainedConfig, PreTrainedModel
 from transformers.modeling_utils import _get_tied_weight_keys
@@ -46,3 +48,14 @@ def test_legacy_list_tied_weights_are_normalized_to_input_embeddings():
     }
     assert model._tied_weights_keys == {"lm_head.weight": "embed_tokens.weight"}
     assert _get_tied_weight_keys(model) == ["lm_head.weight"]
+
+
+def test_legacy_list_tied_weights_allow_save_pretrained():  # save_pretrained must work after legacy list keys are normalized
+    model = _LegacyTiedWeightsModel(_DummyConfig())
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        model._tied_weights_keys = ["lm_head.weight"]  # legacy list form rather than a dict
+        model.get_expanded_tied_weights_keys(all_submodels=False)
+        model._tied_weights_keys = ["lm_head.weight"]  # NOTE(review): reassigned, presumably in case the call above rewrote it — confirm
+        _hf_utils._normalize_legacy_tied_weights_keys(model)
+        model.save_pretrained(tmp_dir, state_dict={}, is_main_process=True)  # empty state_dict: no weights are passed in
diff --git a/tests/test_marlin_jit.py b/tests/test_marlin_jit.py
@@ -268,7 +268,7 @@ def test_sm75_turing_contract_is_present_in_marlin_sources():
 
 
 def test_stage2_dense_four_bit_tiles_stay_in_sync_between_selector_and_codegen():
-    marlin_root = marlin_utils._marlin_root()
+    marlin_root = marlin_utils._ensure_generated_marlin_kernels()
     gemm_cu = (marlin_root / "gptq_marlin.cu").read_text(encoding="utf-8")
     generator_py = (marlin_root / "generate_kernels.py").read_text(encoding="utf-8")
     kernel_u4 = (marlin_root / "kernel_fp16_ku4.cu").read_text(encoding="utf-8")
diff --git a/tests/test_threadx.py b/tests/test_threadx.py