3 changes: 3 additions & 0 deletions include/infinicore/nn/linear.hpp
@@ -34,6 +34,7 @@ class BaseLinear : public Module {
Tensor bias() const { return bias_; }
Tensor weight_scale() const { return weight_scale_; }
Tensor weight_zeros() const { return weight_zeros_; }
Tensor gidx() const { return gidx_; }

std::shared_ptr<infinicore::quantization::BaseQuantization> get_quantization() const { return quantization_; }

@@ -45,6 +46,8 @@ class BaseLinear : public Module {
INFINICORE_NN_PARAMETER(weight_scale);
INFINICORE_NN_PARAMETER(weight_zeros);

INFINICORE_NN_PARAMETER(gidx);

protected:
// Helper method for common forward computation
Tensor compute_linear(Tensor &input) const;
12 changes: 12 additions & 0 deletions include/infinicore/ops/linear_w4a16_gptq_qy.hpp
@@ -0,0 +1,12 @@
#pragma once

#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

Tensor linear_w4a16_gptq_qy(Tensor in, Tensor qweight, Tensor qzeros, Tensor scales, int64_t quant_type, int64_t bit);

void linear_w4a16_gptq_qy_(Tensor out, Tensor in, Tensor qweights, Tensor scales, Tensor qzeros, int64_t quant_type, int64_t bit);

} // namespace infinicore::op
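Note: a minimal usage sketch for the two entry points declared above. The shapes are assumptions inferred from the GPTQ packing used elsewhere in this PR, not taken from a test; quant_type = 0 and bit = 4 mirror the values hard-coded in linear.cc.

// Illustrative only. Assumed shapes for bits = 4, group_size = 128:
//   in      : [batch, seq, in_features]             activations (f16/bf16)
//   qweight : [in_features / 2, out_features]       packed int4 stored as U8
//   qzeros  : [in_features / group, out_features]   per-group zero points
//   scales  : [in_features / group, out_features]   per-group scales
// Out-of-place variant: allocates and returns the output tensor.
Tensor out = infinicore::op::linear_w4a16_gptq_qy(in, qweight, qzeros, scales,
                                                  /*quant_type=*/0, /*bit=*/4);
// In-place variant: writes into a preallocated `out`; note that scales and qzeros
// are passed in the opposite order to the out-of-place overload.
infinicore::op::linear_w4a16_gptq_qy_(out, in, qweight, scales, qzeros,
                                      /*quant_type=*/0, /*bit=*/4);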
13 changes: 13 additions & 0 deletions include/infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp
@@ -0,0 +1,13 @@
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

INFINICORE_GRAPH_OP_CLASS(GptqQyblasGemm, Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t);

void scaled_mm_w4a16_gptq_qy_(Tensor out, const Tensor &in, const Tensor &qweight, const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit);
} // namespace infinicore::op
1 change: 1 addition & 0 deletions include/infinicore/quantization.hpp
@@ -3,5 +3,6 @@
#include "quantization/awq.hpp"
#include "quantization/base_quantization.hpp"
#include "quantization/compressed_tensors.hpp"
#include "quantization/gptq_qy.hpp"
#include "quantization/none_quantizaiton.hpp"
#include "quantization/quantization_scheme.hpp"
2 changes: 1 addition & 1 deletion include/infinicore/quantization/base_quantization.hpp
@@ -6,7 +6,7 @@ namespace infinicore::quantization {
class BaseQuantization {
// Base class for quantization schemes. Intended to be extended to support various quantization methods.
public:
explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config){};
explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {};
virtual ~BaseQuantization() = default;

virtual infinicore::quantization::QuantScheme get_quant_scheme() const = 0;
30 changes: 30 additions & 0 deletions include/infinicore/quantization/gptq_qy.hpp
@@ -0,0 +1,30 @@
#pragma once
#include "base_quantization.hpp"
namespace infinicore::quantization {

class GPTQ_QY : public BaseQuantization {
// This is a temporary class that currently only returns GPTQ W4A16.
// Future enhancements should parse quant_config to extract detailed quantization
// information and support multiple quantization schemes.
public:
explicit GPTQ_QY(const nlohmann::json &quant_config)
: BaseQuantization(quant_config) {};

infinicore::quantization::QuantScheme
get_quant_scheme() const override {
return infinicore::quantization::QuantScheme::GPTQ_W4A16_QY;
};

int get_packing_num() const {
// For GPTQ, 32 / bits int4 weights are packed into a single int32 value.
return 32 / this->get_or<int>("bits", 4); // "bits" defaults to 4, giving 8 values per int32
}

int get_group_size() const {
// Read the group size from quant_config_, falling back to the standard
// GPTQ group size of 128 when it is not specified.
return this->get_or<int>("group_size", 128);
}
};

} // namespace infinicore::quantization
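Note: the two accessors above encode the GPTQ packing arithmetic. The sketch below spells it out for the 4-bit default; the nibble order is an assumption (the common least-significant-nibble-first GPTQ layout) and may not match what the kernel actually expects.

#include <cstdint>

// Illustrative only, not part of this PR: unpack the j-th 4-bit value from one
// packed 32-bit word and dequantize it against its group's scale and zero point.
inline float dequant_int4(uint32_t packed, int j, float scale, int zero, int bits = 4) {
    uint32_t q = (packed >> (bits * j)) & 0xFu; // assumes LSB-first nibble order
    return scale * (static_cast<int>(q) - zero);
}
// With bits = 4: 32 / bits = 8 values per int32 (get_packing_num), and one
// (scale, zero) row per get_group_size() = 128 input channels.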
1 change: 1 addition & 0 deletions include/infinicore/quantization/quantization_scheme.hpp
@@ -7,6 +7,7 @@ enum class QuantScheme {
NONE,
COMPRESSED_TENSOR_W8A8I8,
AWQ_W4A16,
GPTQ_W4A16_QY,
};

enum class KVQuantAlgo {
1 change: 1 addition & 0 deletions include/infiniop.h
@@ -48,6 +48,7 @@
#include "infiniop/ops/fmod.h"
#include "infiniop/ops/gelu.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/gptq_qyblas_gemm.h"
#include "infiniop/ops/hardswish.h"
#include "infiniop/ops/hardtanh.h"
#include "infiniop/ops/hinge_embedding_loss.h"
24 changes: 12 additions & 12 deletions include/infiniop/ops/gemm.h
@@ -6,22 +6,22 @@
typedef struct InfiniopDescriptor *infiniopGemmDescriptor_t;

__INFINI_C __export infiniStatus_t infiniopCreateGemmDescriptor(infiniopHandle_t handle,
infiniopGemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);
infiniopGemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);

__INFINI_C __export infiniStatus_t infiniopGetGemmWorkspaceSize(infiniopGemmDescriptor_t desc, size_t *size);

__INFINI_C __export infiniStatus_t infiniopGemm(infiniopGemmDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta,
void *stream);
void *workspace,
size_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta,
void *stream);

__INFINI_C __export infiniStatus_t infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc);

37 changes: 37 additions & 0 deletions include/infiniop/ops/gptq_qyblas_gemm.h
@@ -0,0 +1,37 @@
#ifndef __INFINIOP_GPTQ_QYBLAS_GEMM_API_H__
#define __INFINIOP_GPTQ_QYBLAS_GEMM_API_H__

#include "../operator_descriptor.h"
#include <cstdint>

typedef struct InfiniopDescriptor *infiniopGptqQyblasGemmDescriptor_t;

__INFINI_C __export infiniStatus_t infiniopCreateGptqQyblasGemmDescriptor(
infiniopHandle_t handle,
infiniopGptqQyblasGemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
infiniopTensorDescriptor_t b_scales_desc,
infiniopTensorDescriptor_t b_zeros_desc);

__INFINI_C __export infiniStatus_t infiniopGetGptqQyblasGemmWorkspaceSize(
infiniopGptqQyblasGemmDescriptor_t desc,
size_t *size);

__INFINI_C __export infiniStatus_t infiniopGptqQyblasGemm(
infiniopGptqQyblasGemmDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *a,
const void *b,
void *b_scale,
void *b_zero,
int64_t quant_type,
int64_t bit,
void *stream);

__INFINI_C __export infiniStatus_t infiniopDestroyGptqQyblasGemmDescriptor(
infiniopGptqQyblasGemmDescriptor_t desc);
#endif
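Note: a hedged sketch of the expected call sequence through this C API. handle, the tensor descriptors, the device buffers, and stream are placeholders assumed to have been created elsewhere; status-code checks are omitted, and quant_type = 0 / bit = 4 mirror the values used in linear.cc.

infiniopGptqQyblasGemmDescriptor_t desc;
infiniopCreateGptqQyblasGemmDescriptor(handle, &desc, out_desc, a_desc, b_desc,
                                       b_scales_desc, b_zeros_desc);

size_t workspace_size = 0;
infiniopGetGptqQyblasGemmWorkspaceSize(desc, &workspace_size);
// ... allocate `workspace` of `workspace_size` bytes on the device ...

infiniopGptqQyblasGemm(desc, workspace, workspace_size, out, a, b,
                       b_scale, b_zero, /*quant_type=*/0, /*bit=*/4, stream);

infiniopDestroyGptqQyblasGemmDescriptor(desc);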
80 changes: 80 additions & 0 deletions src/infinicore/nn/linear.cc
@@ -4,6 +4,7 @@
#include "infinicore/ops/distributed/allreduce.hpp"
#include "infinicore/ops/linear.hpp"
#include "infinicore/ops/linear_w4a16_awq.hpp"
#include "infinicore/ops/linear_w4a16_gptq_qy.hpp"
#include "infinicore/ops/linear_w8a8i8.hpp"
#include <optional>
#include <spdlog/spdlog.h>
@@ -53,6 +54,19 @@ Tensor BaseLinear::compute_linear(Tensor &input) const {
auto output = infinicore::op::linear_w4a16_awq(input_contiguous->contiguous(), qweight, scales, qzeros, bias_opt);
return output;
}
case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: {
Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous();
Tensor qweight = static_cast<const Tensor &>(weight_);
Tensor qzeros = static_cast<const Tensor &>(weight_zeros_);
Tensor scales = static_cast<const Tensor &>(weight_scale_);
Tensor g_idx = static_cast<const Tensor &>(gidx_);
std::optional<Tensor> bias_opt = has_bias_ ? std::make_optional<Tensor>(static_cast<const Tensor &>(bias_)) : std::nullopt;
auto output = infinicore::op::linear_w4a16_gptq_qy(input_contiguous->contiguous(), qweight, qzeros, scales, 0, 4);
if (bias_opt.has_value()) {
infinicore::op::add_(output, output, bias_opt.value()->as_strided(output->shape(), {0, 0, 1}));
}
return output;
}
default: {
// Ensure input is contiguous before creating views (required for matmul)
// This prevents hanging when input tensor has non-contiguous memory layout
@@ -140,6 +154,23 @@ Linear::Linear(size_t in_features, size_t out_features,
}
break;
}
case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: {
weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device);
this->register_parameter("qweight", weight_);
weight_zeros_ = infinicore::nn::Parameter({in_features / 128, out_features}, dtype_, device);
this->register_parameter("qzeros", weight_zeros_);
weight_scale_ = infinicore::nn::Parameter({in_features / 128, out_features}, dtype_, device);
this->register_parameter("scales", weight_scale_);

gidx_ = infinicore::nn::Parameter({in_features}, infinicore::DataType::I32, device);
this->register_parameter("g_idx", gidx_);
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device));
@@ -247,6 +278,27 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur
}
break;
}
case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: {
auto gptq_ptr = std::static_pointer_cast<infinicore::quantization::GPTQ_QY>(this->quantization_);
int group_size = gptq_ptr->get_group_size();
int packing_num = gptq_ptr->get_packing_num();
weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device, 1, tp_rank_, tp_size_);
this->register_parameter("qweight", weight_);
weight_zeros_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 1, tp_rank_, tp_size_);
this->register_parameter("qzeros", weight_zeros_);
weight_scale_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 1, tp_rank_, tp_size_);
this->register_parameter("scales", weight_scale_);
gidx_ = infinicore::nn::Parameter({in_features},
infinicore::DataType::I32,
device, 0, tp_rank_, tp_size_);
this->register_parameter("g_idx", gidx_);
if (bias) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, tp_rank_, tp_size_));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device,
@@ -356,6 +408,34 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, st
}
break;
}
case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: {
// GPTQ W4A16 QY for RowParallelLinear: the sharded dimension is in_features
// (the first dimension of the quantized weight matrix).
// - Weight: packed int4 in U8 containers (two int4 values per U8 byte)
// - Group-wise quantization with group_size = 128
// - Scales and zero points stored per group along the in_features dimension

auto gptq_ptr = std::static_pointer_cast<infinicore::quantization::GPTQ_QY>(this->quantization_);
int group_size = gptq_ptr->get_group_size();
int packing_num = gptq_ptr->get_packing_num();

weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device, 0, tp_rank_, tp_size_);
this->register_parameter("qweight", weight_);
weight_zeros_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 0, tp_rank_, tp_size_);
this->register_parameter("qzeros", weight_zeros_);
weight_scale_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 0, tp_rank_, tp_size_);
this->register_parameter("scales", weight_scale_);

gidx_ = infinicore::nn::Parameter({in_features},
infinicore::DataType::I32,
device, 0, tp_rank_, tp_size_);
this->register_parameter("g_idx", gidx_);
if (bias && (0 == tp_rank_)) {
INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1));
} else {
bias_ = Parameter();
}
break;
}
default: {
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device,
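Note: to make the GPTQ parameter shapes registered above concrete, a small standalone sketch with hypothetical layer sizes (in_features = 4096, out_features = 11008, group_size = 128; the numbers are for illustration only). Tensor-parallel sharding is applied on top of these full shapes: per the diff, RowParallelLinear shards dim 0 (the in_features axis) and ColumnParallelLinear shards dim 1 (the out_features axis) of qweight/scales/qzeros.

#include <cstddef>
#include <cstdio>

int main() {
    std::size_t in_features = 4096, out_features = 11008, group_size = 128;
    std::printf("qweight: {%zu, %zu} U8\n", in_features / 2, out_features);       // {2048, 11008}, two int4 per byte
    std::printf("qzeros : {%zu, %zu}\n", in_features / group_size, out_features); // {32, 11008}, one row per group
    std::printf("scales : {%zu, %zu}\n", in_features / group_size, out_features); // {32, 11008}, one row per group
    std::printf("g_idx  : {%zu} I32\n", in_features);                             // {4096}, group index per input channel
    return 0;
}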
54 changes: 54 additions & 0 deletions src/infinicore/ops/linear_w4a16_gptq_qy/linear_w4a16_gptq_qy.cc
@@ -0,0 +1,54 @@
#include "infinicore/ops/linear_w4a16_gptq_qy.hpp"
#include "infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp"
#include <iostream>
namespace infinicore::op {

Tensor linear_w4a16_gptq_qy(Tensor input, Tensor qweight, Tensor qzeros, Tensor scales, int64_t quant_type, int64_t bit) {

Size ndim = input->ndim();

Size out_features = qweight->shape()[1];

// Compute the output shape: [..., out_features]
auto output_shape = input->shape();
output_shape[ndim - 1] = out_features;

// Allocate the output tensor
auto out = Tensor::zeros(output_shape, input->dtype(), input->device());

// Run the quantized GEMM
linear_w4a16_gptq_qy_(out, input, qweight, scales, qzeros, quant_type, bit);

return out;
}

void linear_w4a16_gptq_qy_(Tensor out, Tensor in, Tensor qweights, Tensor scales, Tensor qzeros, int64_t quant_type, int64_t bit) {

Size in_features = qweights->shape()[0] * 2; // dim 0 of qweights is in_features / 2 (two int4 per U8)
Size out_features = qweights->shape()[1];    // dim 1 is out_features

// Input rank: leading batch dimensions plus the feature dimension
Size ndim = in->ndim();

// Flatten the leading batch dimensions into a single dimension N
Size N = 1;
auto input_shape = in->shape();
for (size_t i = 0; i < ndim - 1; ++i) {
N *= input_shape[i];
}

op::scaled_mm_w4a16_gptq_qy_(
out->view({N, out_features}), // Output: [N, out]
in->view({N, in_features}), // Input: [N, in]
qweights, // Weight: [in/2, out]
scales, // Scales: [in/group, out]
qzeros, // QZeros: [in/group, out]
quant_type, // Quantization type
bit // Bit width
);
// out->debug();
}

} // namespace infinicore::op
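Note: the in-place wrapper above folds all leading batch dimensions into a single row count N before the 2-D GEMM. A worked example of the shape bookkeeping, with assumed sizes:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    std::vector<std::size_t> input_shape = {2, 7, 4096}; // [B, S, in_features]
    std::size_t N = 1;
    for (std::size_t i = 0; i + 1 < input_shape.size(); ++i) {
        N *= input_shape[i];
    }
    std::printf("N = %zu\n", N); // 14: the kernel sees views [14, 4096] and [14, 11008]
    // The caller's out tensor keeps its original [2, 7, 11008] shape.
    return 0;
}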
@@ -0,0 +1,23 @@
#include "infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp"
#include "../../utils.hpp"
#include <iostream>

namespace infinicore::op {

INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(GptqQyblasGemm);

GptqQyblasGemm::GptqQyblasGemm(Tensor out, const Tensor &in, const Tensor &qweight, const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, in, qweight, scales, qzeros);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, in, qweight, scales, qzeros, quant_type, bit);
}

void GptqQyblasGemm::execute(Tensor out, const Tensor &in, const Tensor &qweight, const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(GptqQyblasGemm, out, in, qweight, scales, qzeros, quant_type, bit);
}

void scaled_mm_w4a16_gptq_qy_(Tensor out, const Tensor &in, const Tensor &qweight, const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit) {

GptqQyblasGemm::execute(out, in, qweight, scales, qzeros, quant_type, bit);
}

} // namespace infinicore::op