Tencent
diff --git a/‎angelslim/compressor/quant/core/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎angelslim/compressor/quant/core/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎angelslim/compressor/quant/core/packing_utils.py‎
Lines changed: 19 additions & 0 deletions b/‎angelslim/compressor/quant/core/packing_utils.py‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎angelslim/compressor/quant/modules/helper_layer.py‎
Lines changed: 27 additions & 5 deletions b/‎angelslim/compressor/quant/modules/helper_layer.py‎
Lines changed: 27 additions & 5 deletions
diff --git a/‎angelslim/compressor/quant/ptq.py‎
Lines changed: 2 additions & 0 deletions b/‎angelslim/compressor/quant/ptq.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎angelslim/compressor/transform/factory.py‎
Lines changed: 3 additions & 0 deletions b/‎angelslim/compressor/transform/factory.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎angelslim/compressor/transform/rotation/hadamard_utils.py‎
Lines changed: 1 addition & 1 deletion b/‎angelslim/compressor/transform/rotation/hadamard_utils.py‎
Lines changed: 1 addition & 1 deletion
@@ -19,6 +19,7 @@
     dequantize_gemm,
     pack_weight_to_int8,
     pack_weight_to_int8_gpu,
+    unpack_weight_omni,
 )
 from .quant_func import *  # noqa: F401 F403
 from .sample_func import EMASampler, MultiStepSampler  # noqa: F401
 
@@ -40,6 +40,25 @@ def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int):
     return iweights, izeros
 
 
+def unpack_weight_omni(qweight: torch.Tensor, save_bit: int = 4, pack_bit: int = 8):
+    assert pack_bit % save_bit == 0, "pack_bit must be divisible by save_bit"
+    mask = (1 << save_bit) - 1  # e.g. 0x0F for 4-bit
+    sign_bit = 1 << (save_bit - 1)  # e.g. 0x08 for 4-bit
+    shifts = torch.arange(0, pack_bit, save_bit, device=qweight.device)
+    qweight = qweight.to(torch.int32)
+    # Extract each save_bit-wide field by shifting it down to the low bits.
+    # After the shift, higher fields (or sign fill) still sit above the field of
+    # interest, so every slot is masked and then sign-extended manually below.
+    iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to(
+        torch.int32
+    )
+    # Mask off upper bits and sign-extend every slot (two's complement)
+    iweights = iweights & mask  # isolate save_bit bits
+    iweights = iweights - ((iweights & sign_bit) << 1)  # sign extend
+    iweights = iweights.reshape(iweights.shape[0], -1)
+    return iweights
+
+
 def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits: int):
     reverse_order_tensor = torch.arange(
         iweights.shape[-1],
 
@@ -34,6 +34,7 @@
     reduce_block_padding,
     tensor_quant_dequant_fp8,
     tensor_quant_dequant_int,
+    unpack_weight_omni,
 )
 
 
@@ -547,19 +548,21 @@ def __init__(
         super().__init__()
         self.quant_algo = quant_algo
         weight_scale = weight_scale.to(weight.device)
+        self.group_size = group_size
         if "fp8" in quant_algo:
             if "w4a8" in self.quant_algo:
                 max_value_group_wise = weight_scale.clone()
+                # pack:    weight -> group-wise int4 (no fp8 round-trip at init)
+                # forward: int4 -> dequant with group scales -> fp8 for gemm_fp8
                 tensor_wise_scale = max_value_group_wise.max() / 448.0
-                quant_weight, _ = quantize_weight_per_tensor_fp8(weight, tensor_wise_scale)
-                new_weight_bf16 = quant_weight.to(torch.bfloat16) * tensor_wise_scale
-
+                new_weight_bf16 = weight
                 new_weight_bf16_qdq = fake_quant_dequant(
                     new_weight_bf16, method="groupwise", bits=4, group_size=group_size
                 )
                 quant_weight, _ = quantize_weight_int(
                     new_weight_bf16_qdq, max_value_group_wise, bits=4
                 )
+
                 quant_weight = pack_weight_to_int8(quant_weight)
                 del new_weight_bf16_qdq, new_weight_bf16
                 self.weight_scale_int4 = torch.nn.Parameter(
@@ -600,11 +603,30 @@ def forward(self, x):
                 raise ValueError(f"Unsupported quantization algorithm: {self.quant_algo}")
 
         if "fp8" in self.quant_algo:
+            if "w4a8" in self.quant_algo:
+                # unpack, save as int32
+                weight = self.qweight.to(qinput.device)
+                weight = unpack_weight_omni(weight, save_bit=4, pack_bit=8)
+                weight_scale = self.weight_scale.to(qinput.device)
+
+                scale = (
+                    self.weight_scale_int4.float()
+                    .repeat_interleave(self.group_size, dim=-1)
+                    .to(qinput.device)
+                )  # (out,in)
+                # dequant to float32 (scale was cast with .float())
+                weight = weight * scale
+                # quant to fp8
+                weight, _ = quantize_weight_per_tensor_fp8(weight, weight_scale)
+                # to fp8
+            else:
+                weight = self.weight.to(qinput.device)
+                weight_scale = self.weight_scale.to(qinput.device)
             output = gemm_fp8(
                 act=qinput,
                 act_scale=self.input_scale,
-                weight=self.weight,
-                weight_scale=self.weight_scale,
+                weight=weight,
+                weight_scale=weight_scale,
                 bias=self.bias,
                 out_dtype=x.dtype,
             )
 
@@ -162,6 +162,8 @@ def convert(self):
             if "smooth" in self.quant_helpers:
                 self.smooth.convert()
             self._convert()
+
+        self.transform_runner.convert()
         print_info("convert model done.")
 
     def save(self, save_path: str):
 
@@ -31,6 +31,9 @@ def run(self):
     def save(self):
         pass
 
+    def convert(self):
+        pass
+
 
 class TransformFactory:
     """Factory for creating TransformBase instances from config.
 
@@ -101,7 +101,7 @@ def matmul_hadUt(X):
 def random_hadamard_matrix(size, device):
     # See https://cornell-relaxml.github.io/quip-sharp/ ,
     # Section "Randomized Hadamard Transformation"
-    Q = torch.randint(low=0, high=2, size=(size,)).to(torch.float64)
+    Q = torch.randint(low=0, high=2, size=(size,)).to(torch.float32)
     Q = Q * 2 - 1
     Q = torch.diag(Q)
     return matmul_hadU(Q).to(device)
Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@`
`19`	`19`	`dequantize_gemm,`
`20`	`20`	`pack_weight_to_int8,`
`21`	`21`	`pack_weight_to_int8_gpu,`
	`22`	`+ unpack_weight_omni,`
`22`	`23`	`)`
`23`	`24`	`from .quant_func import * # noqa: F401 F403`
`24`	`25`	`from .sample_func import EMASampler, MultiStepSampler # noqa: F401`