
Commit 455bcad

TimDettmers and claude committed
Fix matmul_4bit out parameter not writing to output tensor (#1235)
The `out` kwarg in `matmul_4bit()` was accepted but ignored in the `MatMul4Bit.forward()` path (2D+ inputs): the computed result was returned as a new tensor without being copied into `out`. This commit adds `out.copy_(output)` after computing the linear result so the caller's pre-allocated tensor is populated as expected.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4ba0ccd commit 455bcad
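For context, here is a minimal sketch of the contract this commit restores, assuming a CUDA device and the same (M, K) x (K, N) shapes used by the new test; with the fix, the pre-allocated buffer is filled and aliased by the return value:

import torch
import bitsandbytes as bnb

A = torch.randn(32, 64, device="cuda", dtype=torch.float16)
W = torch.randn(64, 48, device="cuda", dtype=torch.float16)
B_q, quant_state = bnb.functional.quantize_4bit(W)  # quantized weight + state

out = torch.empty(32, 48, device="cuda", dtype=torch.float16)
result = bnb.matmul_4bit(A, B_q, quant_state, out=out)

# Before this fix, `out` kept its old contents on the 2D path;
# now it holds A @ dequant(B_q) and shares storage with `result`.
assert result.data_ptr() == out.data_ptr()
torch.testing.assert_close(result, bnb.matmul_4bit(A, B_q, quant_state))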

2 files changed: +48 -1 lines


bitsandbytes/autograd/_functions.py

Lines changed: 6 additions & 1 deletion
@@ -314,7 +314,12 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState]
         # 2. MatmulnN
         output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
 
-        # 3. Save state
+        # 3. Write to out tensor if provided
+        if out is not None:
+            out.copy_(output)
+            output = out
+
+        # 4. Save state
         ctx.state = quant_state
         ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype

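Note the `output = out` assignment after the copy: it makes the returned tensor alias the caller's buffer, matching the usual PyTorch convention for `out=` arguments, and it is what the new test's `data_ptr()` assertions verify.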
tests/test_autograd.py

Lines changed: 42 additions & 0 deletions
@@ -262,3 +262,45 @@ def test_matmul_4bit(
 
     if req_grad[2]:
         torch.testing.assert_close(gradBias1, gradBias2)
+
+
+@pytest.mark.parametrize("device", get_available_devices())
+@pytest.mark.parametrize("quant_type", ["nf4", "fp4"], ids=id_formatter("quant_type"))
+@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype)
+@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias"))
+def test_matmul_4bit_out_parameter(device, quant_type, dtype, has_bias):
+    """Test that matmul_4bit(A, B, out=output) writes the result into output (issue #1235)."""
+    M, K, N = 32, 64, 48
+
+    # Create weight matrix (K, N) and quantize: matmul_4bit computes A @ dequant(B)
+    W = torch.randn(K, N, device=device, dtype=dtype)
+    torch.nn.init.xavier_uniform_(W)
+    B_quant, quant_state = bnb.functional.quantize_4bit(W, quant_type=quant_type)
+
+    bias = None
+    if has_bias:
+        bias = torch.randn(N, device=device, dtype=dtype)
+
+    # --- Test 2D input (matrix path through MatMul4Bit) ---
+    A_2d = torch.randn(M, K, device=device, dtype=dtype)
+    expected = bnb.matmul_4bit(A_2d, B_quant, quant_state, bias=bias)
+
+    out_2d = torch.zeros(M, N, device=device, dtype=dtype)
+    returned = bnb.matmul_4bit(A_2d, B_quant, quant_state, out=out_2d, bias=bias)
+
+    # out tensor should contain the result
+    torch.testing.assert_close(out_2d, expected)
+    # returned value should be the same object as out
+    assert returned.data_ptr() == out_2d.data_ptr(), "returned tensor should share storage with out"
+
+    # --- Test 1D input (gemv path) if on CUDA and blocksize divides K ---
+    # Skip bias for 1D: the gemv path has a pre-existing shape bug with bias when K != N.
+    if device == "cuda" and K % quant_state.blocksize == 0 and not has_bias:
+        A_1d = torch.randn(K, device=device, dtype=dtype)
+        expected_1d = bnb.matmul_4bit(A_1d, B_quant, quant_state)
+
+        out_1d = torch.zeros_like(expected_1d)
+        returned_1d = bnb.matmul_4bit(A_1d, B_quant, quant_state, out=out_1d)
+
+        torch.testing.assert_close(out_1d, expected_1d)
+        assert returned_1d.data_ptr() == out_1d.data_ptr(), "returned tensor should share storage with out"
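To exercise just this regression test locally, an invocation along these lines should work (assuming the repository's usual pytest setup):

pytest tests/test_autograd.py -k test_matmul_4bit_out_parameter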
