Skip to content

Commit 9985e28

Browse files
authored
feat: add end-to-end vLLM W4A8+FP8 mixed quantization pipeline (#255)
1 parent 10101f4 commit 9985e28

9 files changed

Lines changed: 1652 additions & 262 deletions

File tree

angelslim/compressor/quant/core/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,11 @@
1515
from .config import * # noqa: F401 F403
1616
from .hook import PTQHook # noqa: F401
1717
from .metrics import LossFilter, mse_loss, snr_loss # noqa: F401
18-
from .packing_utils import dequantize_gemm, pack_weight_to_int8 # noqa: F401
18+
from .packing_utils import ( # noqa: F401
19+
dequantize_gemm,
20+
pack_weight_to_int8,
21+
pack_weight_to_int8_gpu,
22+
)
1923
from .quant_func import * # noqa: F401 F403
2024
from .sample_func import EMASampler, MultiStepSampler # noqa: F401
2125
from .save import DeepSeekV3PTQSaveMulti # noqa: F401

angelslim/compressor/quant/core/packing_utils.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,18 @@ def dequantize_gemm(qweight, qzeros, scales, bits, group_size):
109109

110110

111111
def pack_weight_to_int8(weight):
112+
"""Pack two INT4 values into one INT8 byte (CPU, numpy-based).
113+
114+
Original implementation using Python loops for packing.
115+
Kept for debugging and fallback.
116+
For GPU-accelerated packing, use pack_weight_to_int8_gpu.
117+
118+
Args:
119+
weight: Tensor of shape (out_features, in_features) with values in [-8, 7].
120+
121+
Returns:
122+
Packed INT8 tensor of shape (out_features, in_features // 2) on CPU.
123+
"""
112124
weight = weight.t().contiguous().cpu()
113125
weight = weight.to(torch.float32).numpy().astype(np.int8)
114126

@@ -124,3 +136,34 @@ def pack_weight_to_int8(weight):
124136
packed_weight = packed_weight.astype(np.int8)
125137
packed_weight = torch.from_numpy(packed_weight).t().contiguous()
126138
return packed_weight
139+
140+
141+
def pack_weight_to_int8_gpu(weight):
    """Pack two INT4 values into one INT8 byte using pure PyTorch.

    Works on both CPU and CUDA tensors. No numpy round-trip is involved, so
    the packing runs on whatever device ``weight`` already lives on.

    Layout: the weight is transposed to (in_features, out_features) and
    adjacent rows are paired (rows 0 and 1 -> packed row 0, rows 2 and 3 ->
    packed row 1, ...). The even row fills the low nibble and the odd row
    fills the high nibble of each output byte.

    Args:
        weight: Tensor of shape (out_features, in_features) with values in
            [-8, 7]. Only the low 4 bits of each value (after conversion to
            int8) are kept. in_features must be even.

    Returns:
        Packed INT8 tensor of shape (out_features, in_features // 2), on the
        same device as the input.

    Raises:
        ValueError: If the number of rows to pair (in_features) is odd.
    """
    # Transpose to (in_features, out_features) so packing pairs adjacent rows.
    weight = weight.t().contiguous().to(torch.int8)

    # An odd row count leaves the last even row without a partner. Without
    # this check the bitwise OR below would either fail with an opaque
    # size-mismatch error or, for 1 or 3 rows, silently broadcast and return
    # a wrong result.
    num_rows = weight.shape[0]
    if num_rows % 2 != 0:
        raise ValueError(
            "pack_weight_to_int8_gpu requires an even in_features to pair "
            f"INT4 values, got {num_rows}"
        )

    # Vectorized packing: even rows -> low nibble, odd rows -> high nibble.
    even_rows = weight[0::2]  # shape: (rows // 2, cols)
    odd_rows = weight[1::2]  # shape: (rows // 2, cols)
    # The int8 left shift wraps around, which yields the intended bit pattern
    # for high nibbles >= 8.
    packed_weight = (even_rows & 0x0F) | ((odd_rows & 0x0F) << 4)

    # Transpose back to (out_features, in_features // 2).
    return packed_weight.t().contiguous()

0 commit comments

Comments
 (0)