
Commit 9e07cb0

feat: support SwiGLU
1 parent b553124 commit 9e07cb0

4 files changed: 33 additions & 3 deletions


cs336_basics/embedding.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -6,7 +6,7 @@ def __init__(self, num_embeddings, embedding_dim, device=None, dtype=None):
         self.num_embeddings = num_embeddings
         self.embedding_dim = embedding_dim
         self.weights = torch.nn.Parameter(torch.empty(num_embeddings, embedding_dim))
-        self.weights = torch.nn.init.trunc_normal_(self.weights, mean=0.0, std=1.0, a=-3.0, b=3.0)
+        torch.nn.init.trunc_normal_(self.weights, mean=0.0, std=1.0, a=-3.0, b=3.0)
         self.device = device
         self.dtype = dtype
```

cs336_basics/linear.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -7,7 +7,7 @@ def __init__(self, in_features, out_features, device=None, dtype=None):
         self.out_features = out_features
         self.weights = torch.nn.Parameter(torch.empty(out_features, in_features, device=device, dtype=dtype))
         std = (2.0 / (in_features + out_features)) ** 0.5
-        self.weights = torch.nn.init.trunc_normal_(self.weights, mean=0.0, std=std, a=-3*std, b=3*std)
+        torch.nn.init.trunc_normal_(self.weights, mean=0.0, std=std, a=-3*std, b=3*std)
         self.device = device
         self.dtype = dtype
```
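
Both init changes above are the same cleanup: `torch.nn.init.trunc_normal_`, like every `torch.nn.init.*_` function with a trailing underscore, fills the tensor in place and returns the very object it was given, so re-assigning the result to `self.weights` was redundant. A minimal sketch of the in-place pattern:

```python
import torch

# Trailing underscore = in-place: the returned object is the tensor passed in.
w = torch.nn.Parameter(torch.empty(4, 3))
out = torch.nn.init.trunc_normal_(w, mean=0.0, std=1.0, a=-3.0, b=3.0)

assert out is w                            # same object; no re-assignment needed
assert isinstance(w, torch.nn.Parameter)   # still a registered Parameter
```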

cs336_basics/swiglu.py

Lines changed: 27 additions & 0 deletions
```diff
@@ -0,0 +1,27 @@
+import torch
+
+class SwiGLU(torch.nn.Module):
+    def __init__(self, d_model: int, d_ff=None, device=None, dtype=None):
+        super().__init__()
+        self.d_model = d_model
+        if d_ff is None:
+            d_ff_raw = int((8/3) * d_model)
+            d_ff = ((d_ff_raw + 32) // 64) * 64
+        self.d_ff = d_ff
+
+        self.w1 = torch.nn.Parameter(torch.empty((self.d_ff, self.d_model), device=device, dtype=dtype))
+        self.w3 = torch.nn.Parameter(torch.empty((self.d_ff, self.d_model), device=device, dtype=dtype))
+        self.w2 = torch.nn.Parameter(torch.empty((self.d_model, self.d_ff), device=device, dtype=dtype))
+        std_w1_w3 = (2.0 / (self.d_model + self.d_ff)) ** 0.5
+        std_w2 = (2.0 / (self.d_ff + self.d_model)) ** 0.5
+        torch.nn.init.trunc_normal_(self.w1, mean=0.0, std=std_w1_w3, a=-3*std_w1_w3, b=3*std_w1_w3)
+        torch.nn.init.trunc_normal_(self.w3, mean=0.0, std=std_w1_w3, a=-3*std_w1_w3, b=3*std_w1_w3)
+        torch.nn.init.trunc_normal_(self.w2, mean=0.0, std=std_w2, a=-3*std_w2, b=3*std_w2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        w1x = x @ self.w1.T
+        w3x = x @ self.w3.T
+        silu_w1x = w1x * torch.sigmoid(w1x)
+        gated = silu_w1x * w3x
+        output = gated @ self.w2.T
+        return output
```
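
The forward pass computes SwiGLU(x) = (SiLU(x W1ᵀ) ⊙ x W3ᵀ) W2ᵀ with SiLU(z) = z · sigmoid(z), and when `d_ff` is omitted it defaults to roughly (8/3)·d_model rounded to the nearest multiple of 64 (the `+ 32` before the floor division turns round-down into round-to-nearest). A quick sketch of how the module might be exercised; the `d_model` value and shapes here are illustrative, not from the commit:

```python
import torch
from cs336_basics.swiglu import SwiGLU

# d_model = 512: int((8/3) * 512) = 1365, then (1365 + 32) // 64 * 64 = 1344,
# the multiple of 64 nearest to (8/3) * d_model.
ffn = SwiGLU(d_model=512)
assert ffn.d_ff == 1344

x = torch.randn(2, 16, 512)    # (batch, seq_len, d_model)
y = ffn(x)
assert y.shape == x.shape      # d_model -> d_ff (gated) -> d_model

# The same computation spelled out, mirroring forward():
w1x, w3x = x @ ffn.w1.T, x @ ffn.w3.T
ref = (w1x * torch.sigmoid(w1x) * w3x) @ ffn.w2.T
assert torch.allclose(y, ref)
```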

tests/adapters.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -14,6 +14,7 @@
 from cs336_basics.linear import Linear
 from cs336_basics.embedding import Embedding
 from cs336_basics.rmsnorm import RMSNorm
+from cs336_basics.swiglu import SwiGLU
 
 
 def run_linear(
@@ -91,7 +92,9 @@ def run_swiglu(
     # swiglu.w1.weight.data = w1_weight
     # swiglu.w2.weight.data = w2_weight
     # swiglu.w3.weight.data = w3_weight
-    raise NotImplementedError
+    swiglu = SwiGLU(d_model, d_ff=d_ff)
+    swiglu.load_state_dict({'w1': w1_weight, 'w2': w2_weight, 'w3': w3_weight})
+    return swiglu.forward(in_features)
 
 
 def run_scaled_dot_product_attention(
```
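
Since `SwiGLU` stores `w1`/`w2`/`w3` directly as `nn.Parameter`s rather than as `nn.Linear` submodules (which is why the commented-out `*.weight.data` assignments were abandoned), the state-dict keys are simply `'w1'`, `'w2'`, `'w3'`. A hedged sketch of driving the adapter with random weights; the sizes are assumptions for illustration, and only the keyword names are taken from the diff body:

```python
import torch
from tests.adapters import run_swiglu

d_model, d_ff = 64, 128              # illustrative sizes, not from the test suite
w1 = torch.randn(d_ff, d_model)      # (d_ff, d_model), like SwiGLU.w1
w2 = torch.randn(d_model, d_ff)      # (d_model, d_ff), like SwiGLU.w2
w3 = torch.randn(d_ff, d_model)      # (d_ff, d_model), like SwiGLU.w3
x = torch.randn(4, d_model)

out = run_swiglu(d_model=d_model, d_ff=d_ff, w1_weight=w1,
                 w2_weight=w2, w3_weight=w3, in_features=x)

# run_swiglu builds a SwiGLU, loads the weights, and applies it:
expected = ((x @ w1.T) * torch.sigmoid(x @ w1.T) * (x @ w3.T)) @ w2.T
assert torch.allclose(out, expected)
```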
