fix: use alpha/rank scaling in LoRaLayer (standard LoRA convention) (#846)

kikoncuo · claude · Blaizzy · web-flow · commit 6b0ad8f06b9e · 2026-04-18T14:08:12.000+02:00
* fix: use alpha/rank scaling in LoRaLayer (standard LoRA convention)

  LoRaLayer used raw `alpha` as the scaling factor instead of `alpha / rank`. With the default alpha=16, rank=8, this made the LoRA contribution 8x larger than in PEFT, the original LoRA paper, and mlx-lm.

  Before: scale = alpha = 16.0
  After: scale = alpha / rank = 2.0

  Also fixes replace_lora_with_linear to use the same corrected scale.

  Added tests verifying:
  - scale = alpha / rank
  - Forward pass produces (alpha/rank) * (x @ A @ B)
  - Default settings give 2x scaling, not 16x

  Fixes #845

* test: add B=0 initialization test for LoRaLayer

  Verifies that when B is zeros (default init), the LoRA layer output equals the base linear layer output exactly (no LoRA contribution).

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* refactor: move LoRA scaling tests into test_trainer.py

  Move TestLoRaScaling class from test_trainer_utils.py into test_trainer.py as suggested in review, and revert test_trainer_utils.py to its original state.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Prince Canuma <prince.gdt@gmail.com>
diff --git a/mlx_vlm/tests/test_trainer.py b/mlx_vlm/tests/test_trainer.py
@@ -5,6 +5,7 @@
 import mlx.nn as nn
 
 from mlx_vlm.trainer.datasets import VisionDataset
+from mlx_vlm.trainer.lora import LoRaLayer
 from mlx_vlm.trainer.sft_trainer import TrainingArgs, train
 
 
@@ -142,6 +143,70 @@ def test_train_smoke(self, mock_save_safetensors, mock_iterate_batches):
         mock_save_safetensors.assert_called()
 
 
+class TestLoRaScaling(unittest.TestCase):
+    """Verify LoRaLayer uses alpha/rank scaling (standard LoRA convention)."""
+
+    def test_scale_is_alpha_over_rank(self):
+        linear = nn.Linear(4, 4)
+        lora = LoRaLayer(linear, rank=8, alpha=16.0)
+        self.assertAlmostEqual(lora.scale, 2.0)  # 16 / 8 = 2.0
+
+    def test_scale_with_rank_equals_alpha(self):
+        linear = nn.Linear(4, 4)
+        lora = LoRaLayer(linear, rank=4, alpha=4.0)
+        self.assertAlmostEqual(lora.scale, 1.0)  # 4 / 4 = 1.0
+
+    def test_forward_scaling_matches_peft(self):
+        """LoRA contribution should equal (alpha/rank) * (x @ A @ B)."""
+        linear = nn.Linear(4, 4)
+        lora = LoRaLayer(linear, rank=8, alpha=16.0, dropout=0.0)
+
+        # Set deterministic weights
+        lora.A = mx.ones((4, 8))
+        lora.B = mx.ones((8, 4))
+        x = mx.ones((1, 4))
+
+        base_output = linear(x)
+        actual_output = lora(x)
+        lora_contribution = actual_output - base_output
+
+        # Expected: (alpha / rank) * (x @ A @ B) = 2.0 * (ones(1,4) @ ones(4,8) @ ones(8,4))
+        # x @ A = 4 * ones(1,8), then @ B = 32 * ones(1,4), then * 2.0 = 64
+        expected_per_element = 2.0 * 4.0 * 8.0  # 64.0
+        self.assertAlmostEqual(
+            lora_contribution[0, 0].item(), expected_per_element, places=1
+        )
+
+    def test_b_zero_init_gives_no_lora_contribution(self):
+        """When B is zeros (default init), output should equal base linear."""
+        linear = nn.Linear(4, 4)
+        lora = LoRaLayer(linear, rank=8, alpha=16.0, dropout=0.0)
+        # B is already zeros from __init__, don't override it
+        x = mx.ones((1, 4))
+        base_output = linear(x)
+        lora_output = lora(x)
+        self.assertTrue(mx.allclose(base_output, lora_output).item())
+
+    def test_default_alpha_rank_gives_2x(self):
+        """Default alpha=16, rank=8 should give 2x scaling, not 16x."""
+        linear = nn.Linear(8, 8)
+        lora = LoRaLayer(linear, rank=8, alpha=16.0, dropout=0.0)
+
+        lora.A = mx.ones((8, 8))
+        lora.B = mx.ones((8, 8))
+        x = mx.ones((1, 8))
+
+        base = linear(x)
+        actual = lora(x)
+        contribution = (actual - base)[0, 0].item()
+
+        raw_delta = (x @ lora.A @ lora.B)[0, 0].item()  # 64.0
+
+        # Should be 2x the raw delta, not 16x
+        self.assertAlmostEqual(contribution, 2.0 * raw_delta, places=1)
+        self.assertNotAlmostEqual(contribution, 16.0 * raw_delta, places=1)
+
+
 if __name__ == "__main__":
     unittest.main()
 
diff --git a/mlx_vlm/trainer/lora.py b/mlx_vlm/trainer/lora.py
@@ -31,19 +31,19 @@ def __init__(
             shape=(input_dims, rank),
         )
         self.B = mx.zeros((rank, output_dims))
-        self.alpha = alpha
+        self.scale = alpha / rank
 
     def __call__(self, x):
         y = self.original_layer(x)
         lora_update = (self.dropout(x) @ self.A) @ self.B
-        return y + (self.alpha * lora_update).astype(x.dtype)
+        return y + (self.scale * lora_update).astype(x.dtype)
 
 
 def replace_lora_with_linear(model):
     for i, layer in enumerate(model.layers):
         if isinstance(layer, LoRaLayer):
             # Compute the final merged weight
-            lora_update = layer.alpha * (layer.A @ layer.B)
+            lora_update = layer.scale * (layer.A @ layer.B)
             updated_weight = layer.original_layer.weight + lora_update
             use_bias = layer.original_layer.bias is not None