|
3 | 3 | #include "infinicore/ops.hpp" |
4 | 4 | #include "infinicore/ops/distributed/allreduce.hpp" |
5 | 5 | #include "infinicore/ops/linear.hpp" |
| 6 | +#include "infinicore/ops/linear_w4a16_awq.hpp" |
6 | 7 | #include "infinicore/ops/linear_w8a8i8.hpp" |
7 | 8 | #include <optional> |
8 | 9 | #include <spdlog/spdlog.h> |
@@ -43,6 +44,15 @@ Tensor BaseLinear::compute_linear(Tensor &input) const { |
43 | 44 | auto output = infinicore::op::linear_w8a8i8(input_contiguous->contiguous(), weight_packed_tensor, weight_scale_tensor, bias_opt); |
44 | 45 | return output; |
45 | 46 | } |
| 47 | + case infinicore::quantization::QuantScheme::AWQ_W4A16: { |
| 48 | + Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous(); |
| 49 | + Tensor qweight = static_cast<const Tensor &>(weight_); |
| 50 | + Tensor qzeros = static_cast<const Tensor &>(weight_zeros_); |
| 51 | + Tensor scales = static_cast<const Tensor &>(weight_scale_); |
| 52 | + std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt; |
| 53 | + auto output = infinicore::op::linear_w4a16_awq(input_contiguous, qweight, scales, qzeros, bias_opt); |
| 54 | + return output; |
| 55 | + } |
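
The dequantization behind `linear_w4a16_awq` is group-wise: w = (q − z) · s, with one scale and zero point per group of input channels. Below is a minimal scalar sketch of that arithmetic for one packed word; `dequant_awq_octet` is a hypothetical helper (not part of this repo), and it assumes plain sequential nibble packing, whereas real AutoAWQ kernels interleave the nibble order within each I32 and a production kernel must honor that order.

```cpp
#include <cstdint>
#include <vector>

// Sketch: dequantize the 8 int4 weights packed into one I32 container.
// qzeros_word packs the 8 zero points for the same 8 output columns, and
// scales holds one per-column scale for the current group of input rows.
std::vector<float> dequant_awq_octet(uint32_t qweight_word, uint32_t qzeros_word,
                                     const float scales[8]) {
    std::vector<float> w(8);
    for (int i = 0; i < 8; ++i) {
        const int q = (qweight_word >> (4 * i)) & 0xF; // 4-bit quantized weight
        const int z = (qzeros_word >> (4 * i)) & 0xF;  // 4-bit zero point
        w[i] = static_cast<float>(q - z) * scales[i];  // w = (q - z) * s
    }
    return w;
}
```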
46 | 56 | default: { |
47 | 57 | // Ensure input is contiguous before creating views (required for matmul) |
48 | 58 | // This prevents hanging when input tensor has non-contiguous memory layout |
@@ -116,6 +126,20 @@ Linear::Linear(size_t in_features, size_t out_features, |
116 | 126 | } |
117 | 127 | break; |
118 | 128 | } |
| 129 | + case infinicore::quantization::QuantScheme::AWQ_W4A16: { |
| 130 | + auto awq_ptr = std::static_pointer_cast<infinicore::quantization::AWQ>(this->quantization_); |
| 131 | + const int group_size = awq_ptr->get_group_size(), packing_num = awq_ptr->get_packing_num(); |
| 132 | + weight_ = infinicore::nn::Parameter({in_features, out_features / packing_num}, infinicore::DataType::I32, device); |
| 133 | + this->register_parameter("qweight", weight_); |
| 134 | + weight_zeros_ = infinicore::nn::Parameter({in_features / group_size, out_features / packing_num}, infinicore::DataType::I32, device); |
| 135 | + this->register_parameter("qzeros", weight_zeros_); |
| 136 | + weight_scale_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device); |
| 137 | + this->register_parameter("scales", weight_scale_); |
| 138 | + if (bias) { |
| 139 | + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device)); |
| 140 | + } else { bias_ = Parameter(); } |
| 141 | + break; |
| 142 | + } |
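
For intuition on why these parameter shapes differ from the plain `{out_features, in_features}` layout of the default case, here is a standalone sketch with hypothetical dimensions (a 4096×11008 projection, with AWQ's common defaults group_size = 128 and packing_num = 8):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    // Hypothetical sizes for illustration only.
    const size_t in_features = 4096, out_features = 11008;
    const size_t group_size = 128, packing_num = 8; // 8 int4 values per I32

    // qweight: each I32 packs 8 int4 output columns.
    std::printf("qweight: [%zu, %zu] (I32)\n", in_features, out_features / packing_num);
    // qzeros: one packed zero point per (group, column).
    std::printf("qzeros:  [%zu, %zu] (I32)\n", in_features / group_size, out_features / packing_num);
    // scales: one unpacked scale per (group, column), in the activation dtype.
    std::printf("scales:  [%zu, %zu]\n", in_features / group_size, out_features);
    return 0;
}
```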
119 | 143 | default: { |
120 | 144 | // Initialize parameters using macro |
121 | 145 | INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device)); |
@@ -190,6 +214,39 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur |
190 | 214 | } |
191 | 215 | break; |
192 | 216 | } |
| 217 | + case infinicore::quantization::QuantScheme::AWQ_W4A16: { |
| 218 | + auto awq_ptr = std::static_pointer_cast<infinicore::quantization::AWQ>(this->quantization_); |
| 219 | + int group_size = awq_ptr->get_group_size(); |
| 220 | + int packing_num = awq_ptr->get_packing_num(); |
| 221 | + // Packed weight: [in_features, out_features / packing_num] (8 int4 values per I32) |
| 222 | + weight_ = infinicore::nn::Parameter({in_features, out_features / packing_num}, |
| 223 | + infinicore::DataType::I32, |
| 224 | + device, 1, tp_rank_, tp_size_); |
| 225 | + this->register_parameter("qweight", weight_); |
| 226 | + |
| 227 | + // Weight scale: [in_features / group_size, out_features] |
| 228 | + // One scale (stored in the activation dtype) per group of group_size input channels |
| 229 | + |
| 230 | + weight_scale_ = infinicore::nn::Parameter({in_features / group_size, out_features}, |
| 231 | + dtype_, |
| 232 | + device, 1, tp_rank_, tp_size_); |
| 233 | + this->register_parameter("scales", weight_scale_); |
| 234 | + |
| 235 | + // Weight zeros (zero points): [in_features / group_size, out_features / packing_num] |
| 236 | + // AWQ implementations (e.g., AutoAWQ) pack zero points into I32 containers the |
| 237 | + // same way as qweight, since AWQ uses asymmetric (zero-point) quantization |
| 238 | + weight_zeros_ = infinicore::nn::Parameter({in_features / group_size, out_features / packing_num}, |
| 239 | + infinicore::DataType::I32, |
| 240 | + device, 1, tp_rank_, tp_size_); |
| 241 | + |
| 242 | + this->register_parameter("qzeros", weight_zeros_); |
| 243 | + if (bias) { |
| 244 | + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1)); |
| 245 | + } else { |
| 246 | + bias_ = Parameter(); |
| 247 | + } |
| 248 | + break; |
| 249 | + } |
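
The `1` passed to the three Parameter constructors above selects the shard dimension; since all three AWQ tensors keep their out_features-derived extent on dim 1, column parallelism shards them uniformly. A small sketch of the resulting per-rank shapes (illustrative sizes; assumes the Parameter(dim, rank, size) arguments split that dimension evenly across ranks):

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t in = 4096, out = 11008, group = 128, pack = 8, tp = 2;
    std::array<size_t, 2> qweight{in, out / pack};
    std::array<size_t, 2> scales{in / group, out};
    std::array<size_t, 2> qzeros{in / group, out / pack};
    for (auto *t : {&qweight, &scales, &qzeros}) {
        (*t)[1] /= tp; // column parallel: split dim 1 (output columns) across ranks
    }
    std::printf("per-rank qweight: [%zu, %zu]\n", qweight[0], qweight[1]);
    std::printf("per-rank scales:  [%zu, %zu]\n", scales[0], scales[1]);
    std::printf("per-rank qzeros:  [%zu, %zu]\n", qzeros[0], qzeros[1]);
    return 0;
}
```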
193 | 250 | default: { |
194 | 251 | // Initialize parameters using macro |
195 | 252 | INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, |
@@ -261,6 +318,44 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, st |
261 | 318 | } |
262 | 319 | break; |
263 | 320 | } |
| 321 | + case infinicore::quantization::QuantScheme::AWQ_W4A16: { |
| 322 | + // AWQ W4A16 for RowParallelLinear: the split dimension is in_features (dim 0 of the AWQ-packed tensors) |
| 323 | + // - Weight: packed int4 in I32 containers (8 int4 per I32) |
| 324 | + // - Group-wise quantization, with group_size taken from the AWQ config (typically 128) |
| 325 | + // - Scales and zero points stored per group along the in_features dimension |
| 326 | + |
| 327 | + auto awq_ptr = std::static_pointer_cast<infinicore::quantization::AWQ>(this->quantization_); |
| 328 | + int group_size = awq_ptr->get_group_size(); |
| 329 | + int packing_num = awq_ptr->get_packing_num(); |
| 330 | + |
| 331 | + // Packed weight: [in_features, out_features / packing_num] |
| 332 | + weight_ = infinicore::nn::Parameter({in_features, out_features / packing_num}, |
| 333 | + infinicore::DataType::I32, |
| 334 | + device, 0, tp_rank_, tp_size_); |
| 335 | + this->register_parameter("qweight", weight_); |
| 336 | + |
| 337 | + // Weight scale: [in_features / group_size, out_features] |
| 338 | + |
| 339 | + weight_scale_ = infinicore::nn::Parameter({in_features / group_size, out_features}, |
| 340 | + dtype_, |
| 341 | + device, 0, tp_rank_, tp_size_); |
| 342 | + this->register_parameter("scales", weight_scale_); |
| 343 | + // Weight zeros (zero points): [in_features / group_size, out_features / packing_num] |
| 344 | + weight_zeros_ = infinicore::nn::Parameter({in_features / group_size, out_features / packing_num}, |
| 345 | + infinicore::DataType::I32, |
| 346 | + device, 0, tp_rank_, tp_size_); |
| 347 | + this->register_parameter("qzeros", weight_zeros_); |
| 348 | + |
| 349 | + // Bias handling in RowParallelLinear: |
| 350 | + // - Only rank 0 holds the bias, so it contributes exactly once when the |
| 351 | + //   partial outputs are summed by all-reduce; other ranks keep an empty parameter |
| 352 | + if (bias && (0 == tp_rank_)) { |
| 353 | + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1)); |
| 354 | + } else { |
| 355 | + bias_ = Parameter(); |
| 356 | + } |
| 357 | + break; |
| 358 | + } |
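
To see why only rank 0 may hold the bias: with in_features sharded, each rank computes a partial product y_r = x_r · W_r and the all-reduce sums the partials, so a bias materialized on every rank would be counted tp_size times. A toy scalar model of that bookkeeping (hypothetical numbers, not this repo's forward path):

```cpp
#include <cstdio>

int main() {
    const double partial[2] = {1.5, 2.5}; // per-rank partial outputs y_r
    const double bias = 0.25;             // held by rank 0 only
    double y = 0.0;                       // models the all-reduce sum
    for (int r = 0; r < 2; ++r) {
        y += partial[r] + ((r == 0) ? bias : 0.0); // bias contributes exactly once
    }
    std::printf("all-reduced output: %.2f\n", y); // 1.5 + 2.5 + 0.25 = 4.25
    return 0;
}
```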
264 | 359 | default: { |
265 | 360 | // Initialize parameters using macro |
266 | 361 | INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, |
|