|
4 | 4 | #include "infinicore/ops/distributed/allreduce.hpp" |
5 | 5 | #include "infinicore/ops/linear.hpp" |
6 | 6 | #include "infinicore/ops/linear_w4a16_awq.hpp" |
| 7 | +#include "infinicore/ops/linear_w4a16_gptq_qy.hpp" |
7 | 8 | #include "infinicore/ops/linear_w8a8i8.hpp" |
8 | 9 | #include <optional> |
9 | 10 | #include <spdlog/spdlog.h> |
@@ -53,6 +54,19 @@ Tensor BaseLinear::compute_linear(Tensor &input) const { |
53 | 54 | auto output = infinicore::op::linear_w4a16_awq(input_contiguous->contiguous(), qweight, scales, qzeros, bias_opt); |
54 | 55 | return output; |
55 | 56 | } |
| 57 | + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { |
| 58 | + Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous(); |
| 59 | + Tensor qweight = static_cast<const Tensor &>(weight_); |
| 60 | + Tensor qzeros = static_cast<const Tensor &>(weight_zeros_); |
| 61 | + Tensor scales = static_cast<const Tensor &>(weight_scale_); |
| 62 | + Tensor g_idx = static_cast<const Tensor &>(gidx_); |
| 63 | + std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt; |
| 64 | + auto output = infinicore::op::linear_w4a16_gptq_qy(input_contiguous->contiguous(), qweight, qzeros, scales, 0, 4); |
| 65 | + if (bias_opt.has_value()) { |
| 66 | + infinicore::op::add_(output, output, bias_opt.value()->as_strided(output->shape(), {0, 0, 1})); |
| 67 | + } |
| 68 | + return output; |
| 69 | + } |
56 | 70 | default: { |
57 | 71 | // Ensure input is contiguous before creating views (required for matmul) |
58 | 72 | // This prevents hanging when input tensor has non-contiguous memory layout |
@@ -140,6 +154,23 @@ Linear::Linear(size_t in_features, size_t out_features, |
140 | 154 | } |
141 | 155 | break; |
142 | 156 | } |
| 157 | + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { |
| 158 | + weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device); |
| 159 | + this->register_parameter("qweight", weight_); |
| 160 | + weight_zeros_ = infinicore::nn::Parameter({in_features / 128, out_features}, dtype_, device); |
| 161 | + this->register_parameter("qzeros", weight_zeros_); |
| 162 | + weight_scale_ = infinicore::nn::Parameter({in_features / 128, out_features}, dtype_, device); |
| 163 | + this->register_parameter("scales", weight_scale_); |
| 164 | + |
| 165 | + gidx_ = infinicore::nn::Parameter({in_features}, infinicore::DataType::I32, device); |
| 166 | + this->register_parameter("g_idx", gidx_); |
| 167 | + if (bias) { |
| 168 | + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device)); |
| 169 | + } else { |
| 170 | + bias_ = Parameter(); |
| 171 | + } |
| 172 | + break; |
| 173 | + } |
143 | 174 | default: { |
144 | 175 | // Initialize parameters using macro |
145 | 176 | INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device)); |
@@ -247,6 +278,27 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur |
247 | 278 | } |
248 | 279 | break; |
249 | 280 | } |
| 281 | + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { |
| 282 | + auto gptq_ptr = std::static_pointer_cast<infinicore::quantization::GPTQ_QY>(this->quantization_); |
| 283 | + int group_size = gptq_ptr->get_group_size(); |
| 284 | + int packing_num = gptq_ptr->get_packing_num(); |
| 285 | + weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device, 1, tp_rank_, tp_size_); |
| 286 | + this->register_parameter("qweight", weight_); |
| 287 | + weight_zeros_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 1, tp_rank_, tp_size_); |
| 288 | + this->register_parameter("qzeros", weight_zeros_); |
| 289 | + weight_scale_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 1, tp_rank_, tp_size_); |
| 290 | + this->register_parameter("scales", weight_scale_); |
| 291 | + gidx_ = infinicore::nn::Parameter({in_features}, |
| 292 | + infinicore::DataType::I32, |
| 293 | + device, 0, tp_rank_, tp_size_); |
| 294 | + this->register_parameter("g_idx", gidx_); |
| 295 | + if (bias) { |
| 296 | + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, tp_rank_, tp_size_)); |
| 297 | + } else { |
| 298 | + bias_ = Parameter(); |
| 299 | + } |
| 300 | + break; |
| 301 | + } |
250 | 302 | default: { |
251 | 303 | // Initialize parameters using macro |
252 | 304 | INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, |
@@ -356,6 +408,34 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, st |
356 | 408 | } |
357 | 409 | break; |
358 | 410 | } |
| 411 | + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { |
| 412 | +        // GPTQ W4A16 QY for RowParallelLinear: the split dimension is in_features (dim 1 of the weight matrix) |
| 413 | +        // - Weight: packed int4 in U8 containers (2 int4 per U8, hence in_features / 2 rows) |
| 414 | +        // - Group-wise quantization; group_size comes from the GPTQ_QY config (get_group_size()) |
| 415 | +        // - Scale and zero points stored per group along in_features dimension |
| 416 | + |
| 417 | + auto gptq_ptr = std::static_pointer_cast<infinicore::quantization::GPTQ_QY>(this->quantization_); |
| 418 | + int group_size = gptq_ptr->get_group_size(); |
| 419 | + int packing_num = gptq_ptr->get_packing_num(); |
| 420 | + |
| 421 | + weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device, 0, tp_rank_, tp_size_); |
| 422 | + this->register_parameter("qweight", weight_); |
| 423 | + weight_zeros_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 0, tp_rank_, tp_size_); |
| 424 | + this->register_parameter("qzeros", weight_zeros_); |
| 425 | + weight_scale_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 0, tp_rank_, tp_size_); |
| 426 | + this->register_parameter("scales", weight_scale_); |
| 427 | + |
| 428 | + gidx_ = infinicore::nn::Parameter({in_features}, |
| 429 | + infinicore::DataType::I32, |
| 430 | + device, 0, tp_rank_, tp_size_); |
| 431 | + this->register_parameter("g_idx", gidx_); |
| 432 | + if (bias && (0 == tp_rank_)) { |
| 433 | + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1)); |
| 434 | + } else { |
| 435 | + bias_ = Parameter(); |
| 436 | + } |
| 437 | + break; |
| 438 | + } |
359 | 439 | default: { |
360 | 440 | // Initialize parameters using macro |
361 | 441 | INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, |
|
0 commit comments