Wire quant_predicate for mixed-precision quantization

Blaizzy · claude · Blaizzy · commit 0b94ef969367 · 2026-04-22T23:27:13.000+02:00
Add a quant_predicate on the privacy-filter Model that keeps the MoE
router at 8 bits while the rest of the weights quantize to the user's
chosen bit width. The router is a small but routing-sensitive linear;
a uniform 4-bit quantization of the router was measurably degrading
accuracy in gpt-oss-style models, and the same applies here.

Follow mlx-vlm's pattern in convert.py: delegate quantization to
mlx_lm.utils.quantize_model, passing a wrapper that composes
mlx-embeddings' skip-vision / group-size checks with the model's
quant_predicate. mlx_lm handles recording per-layer overrides into
config["quantization"][path], and the existing load path in utils.py
already respects those.

Verified: bf16 and q4 (uniform) both still extract the same PII spans;
mixed-precision q4-experts + q8-router saves to disk at 4.52 bits per
weight, loads correctly, and extracts the same spans.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/mlx_embeddings/convert.py b/mlx_embeddings/convert.py
@@ -91,23 +91,34 @@ def defaults_for_mode(mode: str, group_size: int, bits: int) -> Tuple[int, int]:
         effective_bits = bits if bits else default_bits
         return effective_group_size, effective_bits
 
-    quantized_config = copy.deepcopy(config)
     effective_group_size, effective_bits = defaults_for_mode(mode, q_group_size, q_bits)
 
-    nn.quantize(
+    # Delegate to mlx_lm.utils.quantize_model (same pattern as mlx-vlm): it reads
+    # `model.quant_predicate` and records per-layer overrides into the config,
+    # while our wrapper adds the skip-vision / group-size sanity checks.
+    from mlx_lm.utils import quantize_model as mlx_lm_quantize_model
+
+    default_predicate = get_class_predicate(
+        skip_vision=skip_vision, q_group_size=effective_group_size
+    )
+    model_predicate = getattr(model, "quant_predicate", None)
+
+    def quant_predicate(path, module):
+        if not default_predicate(path, module):
+            return False
+        if model_predicate is not None:
+            return model_predicate(path, module)
+        return True
+
+    model, quantized_config = mlx_lm_quantize_model(
         model,
+        copy.deepcopy(config),
         group_size=effective_group_size,
         bits=effective_bits,
         mode=mode,
-        class_predicate=get_class_predicate(
-            skip_vision=skip_vision, q_group_size=effective_group_size
-        ),
+        quant_predicate=quant_predicate,
     )
-    quantized_config["quantization"] = {
-        "group_size": effective_group_size,
-        "bits": effective_bits,
-        "mode": mode,
-    }
+
     if "vision_config" in quantized_config and isinstance(
         quantized_config["vision_config"], dict
     ):
diff --git a/mlx_embeddings/models/openai_privacy_filter.py b/mlx_embeddings/models/openai_privacy_filter.py
@@ -155,8 +155,6 @@ def __init__(self, config: ModelArgs):
         )
 
     def __call__(self, x: mx.array) -> mx.array:
-        # Go through the router module so this works with both dense and
-        # QuantizedLinear weights; upcast the softmax for numerical parity.
         router_logits = self.router(x).astype(mx.float32)
 
         k = self.num_experts_per_tok
@@ -315,3 +313,12 @@ def sanitize(self, weights: dict) -> dict:
     @property
     def layers(self):
         return self.model.layers
+
+    @property
+    def quant_predicate(self):
+        def predicate(path, _):
+            if path.endswith("router"):
+                return {"group_size": 64, "bits": 8}
+            return True
+
+        return predicate