-
-
Notifications
You must be signed in to change notification settings - Fork 844
Expand file tree
/
Copy pathtest_triton.py
More file actions
64 lines (50 loc) · 2.55 KB
/
test_triton.py
File metadata and controls
64 lines (50 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pytest
import torch
from bitsandbytes.nn import Linear8bitLt
from bitsandbytes.nn.triton_based_modules import SwitchBackLinear
from bitsandbytes.triton.triton_utils import is_triton_available
from tests.helpers import TRUE_FALSE
@pytest.mark.skipif(
    not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8,
    reason="This test requires triton and a GPU with compute capability 8.0 or higher.",
)
@pytest.mark.deprecated
@pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE)
def test_switchback(vector_wise_quantization):
    """Compare SwitchBackLinear against a standard fp16 Linear reference.

    For the forward output and for the weight, bias, and input gradients,
    the SwitchBack error (vs. the fp16 reference) must stay within 2x the
    error of the Linear8bitLt baseline measured on the same data.

    Requires CUDA (compute capability >= 8.0) and triton; skipped otherwise.
    """

    def _assert_close_enough(label, reference, candidate, baseline):
        # Shared check: candidate's mean-abs error vs. the fp16 reference
        # must be within 2x of the baseline's error on the same quantity.
        err_sb = (reference - candidate).abs().mean()
        err_baseline = (reference - baseline).abs().mean()
        print(label, err_sb, err_baseline)
        assert err_sb < 2 * err_baseline

    # NOTE(review): sizes presumably chosen as non-multiples of 8/16 to hit
    # non-aligned kernel paths — confirm against the triton kernels.
    for dim in [83]:
        for batch in [13]:
            standard = torch.nn.Linear(dim, 4 * dim).cuda().half()
            switchback = (
                SwitchBackLinear(dim, 4 * dim, vector_wise_quantization=vector_wise_quantization).cuda().half()
            )
            baseline = Linear8bitLt(dim, 4 * dim).cuda().half()

            # All three layers start from identical parameters.
            switchback.weight.data.copy_(standard.weight)
            switchback.bias.data.copy_(standard.bias)
            baseline.weight.data.copy_(standard.weight)
            baseline.bias.data.copy_(standard.bias)

            # Three independent leaf tensors with identical values so each
            # layer accumulates its own input gradient.
            x1 = torch.randn(batch, dim).cuda().half().requires_grad_(True)
            x2 = x1.clone().detach().requires_grad_(True)
            x3 = x1.clone().detach().requires_grad_(True)

            # Scale the scalar loss by 2**10 so fp16 gradients stay well
            # above the half-precision underflow range.
            out_standard = standard(x1)
            (2**10 * out_standard.abs().mean()).backward()

            out_sb = switchback(x2)
            (2**10 * out_sb.abs().mean()).backward()

            out_baseline = baseline(x3)
            (2**10 * out_baseline.abs().mean()).backward()

            _assert_close_enough("OUT", out_standard, out_sb, out_baseline)
            _assert_close_enough("GW2", standard.bias.grad, switchback.bias.grad, baseline.bias.grad)
            _assert_close_enough("GW1", standard.weight.grad, switchback.weight.grad, baseline.weight.grad)
            _assert_close_enough("GX1", x1.grad, x2.grad, x3.grad)