Use mlx-vlm-style predicate chaining for quantization

Blaizzy · claude · Blaizzy · commit 18d109c1dea9 · 2026-04-22T23:33:26.000+02:00
Keep the local nn.quantize call, but build its class_predicate with the
predicate-composition pattern from mlx-vlm: chain the default
skip-vision / group-size predicate with the model's own quant_predicate,
and record any per-layer dict results in the quantization config so the
load path re-quantizes the same way.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/mlx_embeddings/convert.py b/mlx_embeddings/convert.py
@@ -91,34 +91,41 @@ def defaults_for_mode(mode: str, group_size: int, bits: int) -> Tuple[int, int]:
         effective_bits = bits if bits else default_bits
         return effective_group_size, effective_bits
 
+    quantized_config = copy.deepcopy(config)
     effective_group_size, effective_bits = defaults_for_mode(mode, q_group_size, q_bits)
 
-    # Delegate to mlx_lm.utils.quantize_model (same pattern as mlx-vlm): it reads
-    # `model.quant_predicate` and records per-layer overrides into the config,
-    # while our wrapper adds the skip-vision / group-size sanity checks.
-    from mlx_lm.utils import quantize_model as mlx_lm_quantize_model
-
+    # Predicate-chaining pattern from mlx-vlm: honor the model's `quant_predicate`
+    # (if any) on top of the default skip-vision / group-size checks, and record
+    # per-layer overrides so the load path re-quantizes the same way.
     default_predicate = get_class_predicate(
         skip_vision=skip_vision, q_group_size=effective_group_size
     )
-    model_predicate = getattr(model, "quant_predicate", None)
+    model_quant_predicate = getattr(model, "quant_predicate", None)
+    overrides: Dict[str, Dict[str, int]] = {}
 
-    def quant_predicate(path, module):
+    def base_quant_predicate(path, module):
         if not default_predicate(path, module):
             return False
-        if model_predicate is not None:
-            return model_predicate(path, module)
-        return True
-
-    model, quantized_config = mlx_lm_quantize_model(
+        if model_quant_predicate is None:
+            return True
+        result = model_quant_predicate(path, module)
+        if isinstance(result, dict):
+            overrides[path] = result
+        return result
+
+    nn.quantize(
         model,
-        copy.deepcopy(config),
         group_size=effective_group_size,
         bits=effective_bits,
         mode=mode,
-        quant_predicate=quant_predicate,
+        class_predicate=base_quant_predicate,
     )
-
+    quantized_config["quantization"] = {
+        "group_size": effective_group_size,
+        "bits": effective_bits,
+        "mode": mode,
+        **overrides,
+    }
     if "vision_config" in quantized_config and isinstance(
         quantized_config["vision_config"], dict
     ):