diff --git a/include/infinicore/nn/linear.hpp b/include/infinicore/nn/linear.hpp index 89a832ceb..81e9944c0 100644 --- a/include/infinicore/nn/linear.hpp +++ b/include/infinicore/nn/linear.hpp @@ -34,6 +34,7 @@ class BaseLinear : public Module { Tensor bias() const { return bias_; } Tensor weight_scale() const { return weight_scale_; } Tensor weight_zeros() const { return weight_zeros_; } + Tensor gidx() const { return gidx_; } std::shared_ptr get_quantization() const { return quantization_; } @@ -45,6 +46,8 @@ class BaseLinear : public Module { INFINICORE_NN_PARAMETER(weight_scale); INFINICORE_NN_PARAMETER(weight_zeros); + INFINICORE_NN_PARAMETER(gidx); + protected: // Helper method for common forward computation Tensor compute_linear(Tensor &input) const; diff --git a/include/infinicore/ops/linear_w4a16_gptq_qy.hpp b/include/infinicore/ops/linear_w4a16_gptq_qy.hpp new file mode 100644 index 000000000..72cc9eef4 --- /dev/null +++ b/include/infinicore/ops/linear_w4a16_gptq_qy.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include "common/op.hpp" +#include + +namespace infinicore::op { + +Tensor linear_w4a16_gptq_qy(Tensor in, Tensor qweight, Tensor qzeros, Tensor scales, int64_t quant_type, int64_t bit); + +void linear_w4a16_gptq_qy_(Tensor out, Tensor in, Tensor qweights, Tensor scales, Tensor qzeros, int64_t quant_type, int64_t bit); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp b/include/infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp new file mode 100644 index 000000000..33ee7c37b --- /dev/null +++ b/include/infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(GptqQyblasGemm, Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t); + +void scaled_mm_w4a16_gptq_qy_(Tensor out, const Tensor &in, const Tensor &qweight, 
const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit); +} // namespace infinicore::op diff --git a/include/infinicore/quantization.hpp b/include/infinicore/quantization.hpp index 7b01312ba..61adaadfa 100644 --- a/include/infinicore/quantization.hpp +++ b/include/infinicore/quantization.hpp @@ -3,5 +3,6 @@ #include "quantization/awq.hpp" #include "quantization/base_quantization.hpp" #include "quantization/compressed_tensors.hpp" +#include "quantization/gptq_qy.hpp" #include "quantization/none_quantizaiton.hpp" #include "quantization/quantization_scheme.hpp" diff --git a/include/infinicore/quantization/base_quantization.hpp b/include/infinicore/quantization/base_quantization.hpp index 53f97deef..38f45f316 100644 --- a/include/infinicore/quantization/base_quantization.hpp +++ b/include/infinicore/quantization/base_quantization.hpp @@ -6,7 +6,7 @@ namespace infinicore::quantization { class BaseQuantization { // Base class for quantization schemes. Intended to be extended to support various quantization methods. public: - explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config){}; + explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; virtual ~BaseQuantization() = default; virtual infinicore::quantization::QuantScheme get_quant_scheme() const = 0; diff --git a/include/infinicore/quantization/gptq_qy.hpp b/include/infinicore/quantization/gptq_qy.hpp new file mode 100644 index 000000000..d1712b257 --- /dev/null +++ b/include/infinicore/quantization/gptq_qy.hpp @@ -0,0 +1,30 @@ +#pragma once +#include "base_quantization.hpp" +namespace infinicore::quantization { + +class GPTQ_QY : public BaseQuantization { + // This is a temporary class that currently only returns GPTQ W4A16. + // Future enhancements should parse quant_config to extract detailed quantization + // information and support multiple quantization schemes. 
+public: + explicit GPTQ_QY(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) {}; + + infinicore::quantization::QuantScheme + get_quant_scheme() const override { + return infinicore::quantization::QuantScheme::GPTQ_W4A16_QY; + }; + + int get_packing_num() const { + // For GPTQ, we pack 8 int4 weights into a single int32 value. + return 32 / this->get_or("bits", 4); // Default to 8 if not specified in config + } + + int get_group_size() const { + // For simplicity, we return a fixed group size here. In a more complete implementation, + // this could be extracted from quant_config_ to support different group sizes. + return this->get_or("group_size", 128); // Standard GPTQ group size + } +}; + +} // namespace infinicore::quantization diff --git a/include/infinicore/quantization/quantization_scheme.hpp b/include/infinicore/quantization/quantization_scheme.hpp index 9c08ea6e0..f2c98321f 100644 --- a/include/infinicore/quantization/quantization_scheme.hpp +++ b/include/infinicore/quantization/quantization_scheme.hpp @@ -7,6 +7,7 @@ enum class QuantScheme { NONE, COMPRESSED_TENSOR_W8A8I8, AWQ_W4A16, + GPTQ_W4A16_QY, }; enum class KVQuantAlgo { diff --git a/include/infiniop.h b/include/infiniop.h index f38fc9744..aeefe952a 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -48,6 +48,7 @@ #include "infiniop/ops/fmod.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/gptq_qyblas_gemm.h" #include "infiniop/ops/hardswish.h" #include "infiniop/ops/hardtanh.h" #include "infiniop/ops/hinge_embedding_loss.h" diff --git a/include/infiniop/ops/gemm.h b/include/infiniop/ops/gemm.h index 3d4d4dc0c..430e37003 100644 --- a/include/infiniop/ops/gemm.h +++ b/include/infiniop/ops/gemm.h @@ -6,22 +6,22 @@ typedef struct InfiniopDescriptor *infiniopGemmDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateGemmDescriptor(infiniopHandle_t handle, - infiniopGemmDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t 
c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc); + infiniopGemmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); __INFINI_C __export infiniStatus_t infiniopGetGemmWorkspaceSize(infiniopGemmDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopGemm(infiniopGemmDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - void const *a, - void const *b, - float alpha, - float beta, - void *stream); + void *workspace, + size_t workspace_size, + void *c, + void const *a, + void const *b, + float alpha, + float beta, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc); diff --git a/include/infiniop/ops/gptq_qyblas_gemm.h b/include/infiniop/ops/gptq_qyblas_gemm.h new file mode 100644 index 000000000..bb105132c --- /dev/null +++ b/include/infiniop/ops/gptq_qyblas_gemm.h @@ -0,0 +1,37 @@ +#ifndef __INFINIOP_GPTQ_QYBLAS_GEMM_API_H__ +#define __INFINIOP_GPTQ_QYBLAS_GEMM_API_H__ + +#include "../operator_descriptor.h" +#include + +typedef struct InfiniopDescriptor *infiniopGptqQyblasGemmDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateGptqQyblasGemmDescriptor( + infiniopHandle_t handle, + infiniopGptqQyblasGemmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t b_scales_desc, + infiniopTensorDescriptor_t b_zeros_desc); + +__INFINI_C __export infiniStatus_t infiniopGetGptqQyblasGemmWorkspaceSize( + infiniopGptqQyblasGemmDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopGptqQyblasGemm( + infiniopGptqQyblasGemmDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *a, + const void *b, + void *b_scale, + void *b_zero, + int64_t quant_type, + int64_t bit, + void *stream); + 
+__INFINI_C __export infiniStatus_t infiniopDestroyGptqQyblasGemmDescriptor( + infiniopGptqQyblasGemmDescriptor_t desc); +#endif diff --git a/src/infinicore/nn/linear.cc b/src/infinicore/nn/linear.cc index 05342e651..ebe607bab 100644 --- a/src/infinicore/nn/linear.cc +++ b/src/infinicore/nn/linear.cc @@ -4,6 +4,7 @@ #include "infinicore/ops/distributed/allreduce.hpp" #include "infinicore/ops/linear.hpp" #include "infinicore/ops/linear_w4a16_awq.hpp" +#include "infinicore/ops/linear_w4a16_gptq_qy.hpp" #include "infinicore/ops/linear_w8a8i8.hpp" #include #include @@ -53,6 +54,19 @@ Tensor BaseLinear::compute_linear(Tensor &input) const { auto output = infinicore::op::linear_w4a16_awq(input_contiguous->contiguous(), qweight, scales, qzeros, bias_opt); return output; } + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { + Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous(); + Tensor qweight = static_cast(weight_); + Tensor qzeros = static_cast(weight_zeros_); + Tensor scales = static_cast(weight_scale_); + Tensor g_idx = static_cast(gidx_); + std::optional bias_opt = has_bias_ ? 
std::make_optional(static_cast(bias_)) : std::nullopt; + auto output = infinicore::op::linear_w4a16_gptq_qy(input_contiguous->contiguous(), qweight, qzeros, scales, 0, 4); + if (bias_opt.has_value()) { + infinicore::op::add_(output, output, bias_opt.value()->as_strided(output->shape(), {0, 0, 1})); + } + return output; + } default: { // Ensure input is contiguous before creating views (required for matmul) // This prevents hanging when input tensor has non-contiguous memory layout @@ -140,6 +154,23 @@ Linear::Linear(size_t in_features, size_t out_features, } break; } + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { + weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device); + this->register_parameter("qweight", weight_); + weight_zeros_ = infinicore::nn::Parameter({in_features / 128, out_features}, dtype_, device); + this->register_parameter("qzeros", weight_zeros_); + weight_scale_ = infinicore::nn::Parameter({in_features / 128, out_features}, dtype_, device); + this->register_parameter("scales", weight_scale_); + + gidx_ = infinicore::nn::Parameter({in_features}, infinicore::DataType::I32, device); + this->register_parameter("g_idx", gidx_); + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device)); + } else { + bias_ = Parameter(); + } + break; + } default: { // Initialize parameters using macro INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device)); @@ -247,6 +278,27 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur } break; } + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { + auto gptq_ptr = std::static_pointer_cast(this->quantization_); + int group_size = gptq_ptr->get_group_size(); + int packing_num = gptq_ptr->get_packing_num(); + weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device, 1, tp_rank_, tp_size_); + this->register_parameter("qweight", 
weight_); + weight_zeros_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 1, tp_rank_, tp_size_); + this->register_parameter("qzeros", weight_zeros_); + weight_scale_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 1, tp_rank_, tp_size_); + this->register_parameter("scales", weight_scale_); + gidx_ = infinicore::nn::Parameter({in_features}, + infinicore::DataType::I32, + device, 0, tp_rank_, tp_size_); + this->register_parameter("g_idx", gidx_); + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, tp_rank_, tp_size_)); + } else { + bias_ = Parameter(); + } + break; + } default: { // Initialize parameters using macro INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, @@ -356,6 +408,34 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, st } break; } + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { + // GPTQ W4A16 QY for RowParallelLinear:切分维度为 in_features(权重矩阵的第1维) + // - Weight: packed int4 in U8 containers (8 int4 per U8) + // - Group-wise quantization with group_size=128 + // - Scale and zero points stored per group along in_features dimension + + auto gptq_ptr = std::static_pointer_cast(this->quantization_); + int group_size = gptq_ptr->get_group_size(); + int packing_num = gptq_ptr->get_packing_num(); + + weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device, 0, tp_rank_, tp_size_); + this->register_parameter("qweight", weight_); + weight_zeros_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 0, tp_rank_, tp_size_); + this->register_parameter("qzeros", weight_zeros_); + weight_scale_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 0, tp_rank_, tp_size_); + this->register_parameter("scales", weight_scale_); + + gidx_ = infinicore::nn::Parameter({in_features}, 
+ infinicore::DataType::I32, + device, 0, tp_rank_, tp_size_); + this->register_parameter("g_idx", gidx_); + if (bias && (0 == tp_rank_)) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1)); + } else { + bias_ = Parameter(); + } + break; + } default: { // Initialize parameters using macro INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, diff --git a/src/infinicore/ops/linear_w4a16_gptq_qy/linear_w4a16_gptq_qy.cc b/src/infinicore/ops/linear_w4a16_gptq_qy/linear_w4a16_gptq_qy.cc new file mode 100644 index 000000000..bb088d709 --- /dev/null +++ b/src/infinicore/ops/linear_w4a16_gptq_qy/linear_w4a16_gptq_qy.cc @@ -0,0 +1,54 @@ +#include "infinicore/ops/linear_w4a16_gptq_qy.hpp" +#include "infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp" +#include +namespace infinicore::op { + +Tensor linear_w4a16_gptq_qy(Tensor input, Tensor qweight, Tensor qzeros, Tensor scales, int64_t quant_type, int64_t bit) { + + Size ndim = input->ndim(); + + Size out_features = qweight->shape()[1]; + + // 2. 计算输出形状 [..., out_features] + auto output_shape = input->shape(); + output_shape[ndim - 1] = out_features; + + // 3. 分配输出显存 + auto out = Tensor::zeros(output_shape, input->dtype(), input->device()); + + // 4. 
执行计算 + linear_w4a16_gptq_qy_(out, input, qweight, scales, qzeros, quant_type, bit); + + return out; +} + +void linear_w4a16_gptq_qy_(Tensor out, Tensor in, Tensor qweights, Tensor scales, Tensor qzeros, int64_t quant_type, int64_t bit) { + + Size in_features = qweights->shape()[0] * 2; // ✅ 修正:第 0 维是 in/2 + Size out_features = qweights->shape()[1]; // ✅ 修正:第 1 维是 out + + // 检查输入输出维度 + Size ndim = in->ndim(); + + // ======================================================================== + // 合并 Batch 维度 + // ======================================================================== + Size N = 1; + auto input_shape = in->shape(); + for (size_t i = 0; i < ndim - 1; ++i) { + N *= input_shape[i]; + } + + op::scaled_mm_w4a16_gptq_qy_( + out->view({N, out_features}), // Output: [N, out] + in->view({N, in_features}), // Input: [N, in] + qweights, // Weight: [in/2, out] + scales, // Scales: [in/group, out] + qzeros, // QZeros: [in/group, out] + quant_type, // Quantization type + bit // Bit width + ); + // out->debug(); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy.cc b/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy.cc new file mode 100644 index 000000000..3afb715f0 --- /dev/null +++ b/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy.cc @@ -0,0 +1,23 @@ +#include "infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp" +#include "../../utils.hpp" +#include + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(GptqQyblasGemm); + +GptqQyblasGemm::GptqQyblasGemm(Tensor out, const Tensor &in, const Tensor &qweight, const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, in, qweight, scales, qzeros); + INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, in, qweight, scales, qzeros, quant_type, bit); +} + +void GptqQyblasGemm::execute(Tensor out, const Tensor &in, const Tensor &qweight, const 
Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(GptqQyblasGemm, out, in, qweight, scales, qzeros, quant_type, bit); +} + +void scaled_mm_w4a16_gptq_qy_(Tensor out, const Tensor &in, const Tensor &qweight, const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit) { + + GptqQyblasGemm::execute(out, in, qweight, scales, qzeros, quant_type, bit); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy_infiniop.cc b/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy_infiniop.cc new file mode 100644 index 000000000..5ead44772 --- /dev/null +++ b/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "../infiniop_impl.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp" +#include + +namespace infinicore::op::scaled_mm_w4a16_gptq_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, GptqQyblasGemm, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, out, in, qweight, scales, qzeros; + int64_t quant_type, bit; +}; + +void *plan(Tensor out, const Tensor &in, const Tensor &qweight, const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit) { + size_t seed = hash_combine(out, in, qweight, scales, qzeros); + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, GptqQyblasGemm, + seed, + out->desc(), in->desc(), qweight->desc(), scales->desc(), qzeros->desc()); + INFINIOP_WORKSPACE_TENSOR(workspace, GptqQyblasGemm, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(out), + graph::GraphTensor(in), + graph::GraphTensor(qweight), + graph::GraphTensor(scales), + graph::GraphTensor(qzeros), + quant_type, bit}; +} + +void 
run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + INFINICORE_CHECK_ERROR(infiniopGptqQyblasGemm( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->out->data(), + planned->in->data(), + planned->qweight->data(), + planned->scales->data(), + planned->qzeros->data(), + planned->quant_type, planned->bit, + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(GptqQyblasGemm, &plan, &run, &cleanup); + +} // namespace infinicore::op::scaled_mm_w4a16_gptq_impl::infiniop diff --git a/src/infinicore/tensor/debug.cc b/src/infinicore/tensor/debug.cc index b57b00a52..9b4e97777 100644 --- a/src/infinicore/tensor/debug.cc +++ b/src/infinicore/tensor/debug.cc @@ -109,6 +109,20 @@ void print_data_i8(const int8_t *data, const Shape &shape, const Strides &stride } } +// Function for printing U8 data +void print_data_u8(const uint8_t *data, const Shape &shape, const Strides &strides, size_t dim) { + if (dim == shape.size() - 1) { + for (size_t i = 0; i < shape[dim]; i++) { + std::cout << static_cast(data[i * strides[dim]]) << " "; + } + std::cout << std::endl; + } else if (dim < shape.size() - 1) { + for (size_t i = 0; i < shape[dim]; i++) { + print_data_u8(data + i * strides[dim], shape, strides, dim + 1); + } + } +} + // Template function for writing data recursively to binary file (handles non-contiguous tensors) template void write_binary_data(std::ofstream &out, const T *data, const Shape &shape, const Strides &strides, size_t dim) { @@ -191,8 +205,8 @@ void TensorImpl::debug(const std::string &filename) const { cpu_tensor->shape(), cpu_tensor->strides(), 0); break; case DataType::U8: - print_data(reinterpret_cast(cpu_data), - cpu_tensor->shape(), cpu_tensor->strides(), 0); + print_data_u8(reinterpret_cast(cpu_data), + cpu_tensor->shape(), cpu_tensor->strides(), 0); 
break; case DataType::I8: print_data_i8(reinterpret_cast(cpu_data), diff --git a/src/infiniop/ops/gemm/operator.cc b/src/infiniop/ops/gemm/operator.cc index ac4e01e83..81d9cb066 100644 --- a/src/infiniop/ops/gemm/operator.cc +++ b/src/infiniop/ops/gemm/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/gemm_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API) #include "nvidia/gemm_nvidia.cuh" #endif #ifdef ENABLE_CAMBRICON_API @@ -23,7 +23,9 @@ #ifdef ENABLE_KUNLUN_API #include "kunlun/gemm_kunlun.h" #endif - +#ifdef ENABLE_QY_API +#include "qy/gemm_qy.cuh" +#endif __INFINI_C infiniStatus_t infiniopCreateGemmDescriptor( infiniopHandle_t handle, infiniopGemmDescriptor_t *desc_ptr, @@ -55,7 +57,7 @@ __INFINI_C infiniStatus_t infiniopCreateGemmDescriptor( CREATE(INFINI_DEVICE_ALI, nvidia); #endif #ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); + CREATE(INFINI_DEVICE_QY, qy); #endif #ifdef ENABLE_HYGON_API CREATE(INFINI_DEVICE_HYGON, nvidia); @@ -109,7 +111,7 @@ infiniopGetGemmWorkspaceSize( GET(INFINI_DEVICE_ALI, nvidia); #endif #ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); + GET(INFINI_DEVICE_QY, qy); #endif #ifdef ENABLE_HYGON_API GET(INFINI_DEVICE_HYGON, nvidia); @@ -170,7 +172,7 @@ __INFINI_C infiniStatus_t infiniopGemm( CALCULATE(INFINI_DEVICE_ALI, nvidia); #endif #ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); + CALCULATE(INFINI_DEVICE_QY, qy); #endif #ifdef ENABLE_HYGON_API CALCULATE(INFINI_DEVICE_HYGON, nvidia); @@ -221,7 +223,7 @@ infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) { DELETE(INFINI_DEVICE_ALI, nvidia); #endif #ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); + DELETE(INFINI_DEVICE_QY, qy); #endif #ifdef ENABLE_HYGON_API DELETE(INFINI_DEVICE_HYGON, nvidia); diff 
--git a/src/infiniop/ops/gemm/qy/gemm_qy.cu b/src/infiniop/ops/gemm/qy/gemm_qy.cu new file mode 100644 index 000000000..34241a34f --- /dev/null +++ b/src/infiniop/ops/gemm/qy/gemm_qy.cu @@ -0,0 +1,134 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "gemm_qy.cuh" + +namespace op::gemm::qy { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + auto handle = reinterpret_cast(handle_); + auto dtype = c_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MatmulInfo::create(c_desc, a_desc, b_desc, MatrixLayout::COL_MAJOR); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + dtype, result.take(), 0, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *c, + float beta, + const void *a, + const void *b, + float alpha, + void *stream) const { + + cudaDataType a_type, b_type, c_type; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cudaDataType compute_type; +#else + cublasComputeType_t compute_type; +#endif + + switch (_dtype) { + case INFINI_DTYPE_F16: + a_type = b_type = c_type = CUDA_R_16F; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + compute_type = CUDA_R_32F; +#else + compute_type = CUBLAS_COMPUTE_32F; +#endif + break; + case INFINI_DTYPE_BF16: + a_type = b_type = c_type = CUDA_R_16BF; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + compute_type = CUDA_R_32F; +#else + compute_type = CUBLAS_COMPUTE_32F; +#endif + break; + case INFINI_DTYPE_F32: + a_type = b_type = c_type = CUDA_R_32F; +#if defined(ENABLE_ILUVATAR_API) || 
defined(ENABLE_HYGON_API) + compute_type = CUDA_R_32F; +#else + compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; +#endif + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (_info.is_transed) { + std::swap(a, b); + } + + auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; + auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; + + CHECK_STATUS(_opaque->internal->useCublas( + (cudaStream_t)stream, + [&](cublasHandle_t handle) { + // 1. 获取batch数和各矩阵的元素大小(字节) + const int batch = static_cast(_info.batch); + const size_t a_elem_size = 2; // getCudaDataTypeSize(a_type); + const size_t b_elem_size = 2; // getCudaDataTypeSize(b_type); + const size_t c_elem_size = 2; // getCudaDataTypeSize(c_type); + + // 2. 循环处理每个batch + for (int i = 0; i < batch; ++i) { + // 计算当前batch的A/B/C指针(stride是元素步长,转字节步长) + const void *a_batch = (const char *)a + i * _info.a_matrix.stride * a_elem_size; + const void *b_batch = (const char *)b + i * _info.b_matrix.stride * b_elem_size; + void *c_batch = (char *)c + i * _info.c_matrix.stride * c_elem_size; + + // 3. 
调用单batch的cublasGemmEx(参数与原接口完全对齐) + CHECK_CUBLAS( + cublasGemmEx( + handle, + op_a, // 原op_a + op_b, // 原op_b + static_cast(_info.m), // 原m + static_cast(_info.n), // 原n + static_cast(_info.k), // 原k + &alpha, // 原alpha + a_batch, // 当前batch的A指针 + a_type, // 原a_type + static_cast(_info.a_matrix.ld()), // 原a的ld + b_batch, // 当前batch的B指针 + b_type, // 原b_type + static_cast(_info.b_matrix.ld()), // 原b的ld + &beta, // 原beta + c_batch, // 当前batch的C指针 + c_type, // 原c_type + static_cast(_info.c_matrix.ld()), // 原c的ld + compute_type, // 原compute_type + CUBLAS_GEMM_DEFAULT_TENSOR_OP // 原算法选择 + )); + } + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::gemm::qy diff --git a/src/infiniop/ops/gemm/qy/gemm_qy.cuh b/src/infiniop/ops/gemm/qy/gemm_qy.cuh new file mode 100644 index 000000000..0b218bac9 --- /dev/null +++ b/src/infiniop/ops/gemm/qy/gemm_qy.cuh @@ -0,0 +1,8 @@ +#ifndef __GEMM_QY_CUH__ +#define __GEMM_QY_CUH__ + +#include "../gemm.h" + +DESCRIPTOR(qy) + +#endif // __GEMM_QY_CUH__ diff --git a/src/infiniop/ops/gptq_qyblas_gemm/gptq_qyblas_gemm.h b/src/infiniop/ops/gptq_qyblas_gemm/gptq_qyblas_gemm.h new file mode 100644 index 000000000..456e540e1 --- /dev/null +++ b/src/infiniop/ops/gptq_qyblas_gemm/gptq_qyblas_gemm.h @@ -0,0 +1,49 @@ +#ifndef GPTQ_QYBLAS_GEMM_H +#define GPTQ_QYBLAS_GEMM_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::gptq_qyblas_gemm::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + GptqQyblasGemmInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + GptqQyblasGemmInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return 
_workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t a_desc, \ + infiniopTensorDescriptor_t b_desc, \ + infiniopTensorDescriptor_t b_scales_desc, \ + infiniopTensorDescriptor_t b_zeros_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *out, \ + const void *a, const void *b, void *b_scale, void *b_zero, int64_t quant_type, int64_t bit, \ + void *stream) const; \ + }; \ + } + +#endif // GPTQ_QYBLAS_GEMM_H diff --git a/src/infiniop/ops/gptq_qyblas_gemm/info.h b/src/infiniop/ops/gptq_qyblas_gemm/info.h new file mode 100644 index 000000000..24820d53b --- /dev/null +++ b/src/infiniop/ops/gptq_qyblas_gemm/info.h @@ -0,0 +1,117 @@ +#ifndef __GPTQ_QYBLAS_GEMM_INFO_H__ +#define __GPTQ_QYBLAS_GEMM_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include +#include + +inline void prepare_matrix_for_cublas( + infiniopTensorDescriptor_t tensor, + bool &transpose_tensor) { + + auto strides = tensor->strides(); + auto sizes = tensor->shape(); + + if ((strides[0] == 1) && (strides[1] >= std::max(1, sizes[0]))) { + + transpose_tensor = false; + return; + } + if ((strides[1] == 1) && (strides[0] >= std::max(1, sizes[1]))) { + + transpose_tensor = true; + return; + } + transpose_tensor = true; +} + +namespace op::gptq_qyblas_gemm { + +class GptqQyblasGemmInfo { + GptqQyblasGemmInfo() = default; + +public: + infiniDtype_t dtype, weight_dtype, scales_dtype, zeros_dtype, out_dtype; + size_t M, K, N, scales_size_0, scales_size_1; + ptrdiff_t lda, ldb, result_ld; + bool transpose_result; + char transa, transb; + + static utils::Result createGptqQyblasGemmInfo( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t b_scales_desc, + infiniopTensorDescriptor_t b_zeros_desc) { + + auto dtype = a_desc->dtype(); + + 
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + auto out_dtype = out_desc->dtype(); + CHECK_DTYPE(dtype, out_dtype); + + const infiniDtype_t weight_dtype = b_desc->dtype(); + // CHECK_DTYPE(weight_dtype, INFINI_DTYPE_F8, INFINI_DTYPE_U8, INFINI_DTYPE_I8); + + const infiniDtype_t scales_dtype = b_scales_desc->dtype(); + const infiniDtype_t zeros_dtype = b_zeros_desc->dtype(); + + bool transpose_result = false; + bool transpose_mat_1 = false; + bool transpose_mat_2 = false; + + prepare_matrix_for_cublas(out_desc, transpose_result); + + auto mata = (transpose_result ? b_desc : a_desc); + prepare_matrix_for_cublas(transpose_result ? b_desc : a_desc, transpose_mat_1); + auto matb = (transpose_result ? a_desc : b_desc); + prepare_matrix_for_cublas(transpose_result ? a_desc : b_desc, transpose_mat_2); + + auto mat1_sizes = a_desc->shape(); + auto mat2_sizes = b_desc->shape(); + if (transpose_result) { + transpose_mat_1 = !transpose_mat_1; + transpose_mat_2 = !transpose_mat_2; + mat1_sizes = mata->shape(); + mat2_sizes = matb->shape(); + } + + size_t M = mat1_sizes[transpose_result ? 1 : 0]; + size_t K = mat1_sizes[transpose_result ? 0 : 1]; + size_t N = mat2_sizes[transpose_result ? 0 : 1]; + + size_t scales_size_0 = b_scales_desc->shape()[0]; + size_t scales_size_1 = b_scales_desc->shape()[1]; + + auto ndim = out_desc->ndim(); + CHECK_OR_RETURN(ndim == 2 + && a_desc->ndim() == ndim + && b_desc->ndim() == ndim + && b_scales_desc->ndim() == ndim + && b_zeros_desc->ndim() == ndim, + INFINI_STATUS_BAD_TENSOR_SHAPE); + + ptrdiff_t lda = mata->strides()[(transpose_mat_1 == transpose_result) + ? 1 + : 0]; + ptrdiff_t ldb = matb->strides()[(transpose_mat_2 == transpose_result) + ? 1 + : 0]; + ptrdiff_t result_ld = out_desc->strides()[transpose_result ? 0 : 1]; + + char transa = transpose_mat_1 ? 't' : 'n'; + char transb = transpose_mat_2 ? 
't' : 'n'; + + return utils::Result(GptqQyblasGemmInfo{ + dtype, weight_dtype, scales_dtype, zeros_dtype, out_dtype, + M, K, N, scales_size_0, scales_size_1, + lda, ldb, result_ld, + transpose_result, + transa, transb}); + } +}; + +} // namespace op::gptq_qyblas_gemm + +#endif // __GPTQ_QYBLAS_GEMM_INFO_H__ diff --git a/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cu b/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cu new file mode 100644 index 000000000..9aab07aff --- /dev/null +++ b/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cu @@ -0,0 +1,198 @@ +#if defined ENABLE_QY_API +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "dlblas_ext.h" +#include "gptq_qyblas_gemm_nvidia.cuh" + +inline cudaDataType_t ScalarTypeToCudaDataType( + infiniDtype_t scalar_type) { + switch (scalar_type) { + case INFINI_DTYPE_U8: + return CUDA_R_8U; + case INFINI_DTYPE_I8: + return CUDA_R_8I; + case INFINI_DTYPE_I32: + return CUDA_R_32I; + case INFINI_DTYPE_F16: + return CUDA_R_16F; + case INFINI_DTYPE_F32: + return CUDA_R_32F; + case INFINI_DTYPE_F64: + return CUDA_R_64F; + case INFINI_DTYPE_I16: + return CUDA_R_16I; + case INFINI_DTYPE_I64: + return CUDA_R_64I; + case INFINI_DTYPE_BF16: + return CUDA_R_16BF; + case INFINI_DTYPE_F8: + return (cudaDataType_t)CUDA_R_8F_E4M3; + default: + fprintf(stderr, + "Cannot convert ScalarType %d\n", + (int)scalar_type); + abort(); + } +} +namespace op::gptq_qyblas_gemm::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t b_scales_desc, + infiniopTensorDescriptor_t b_zeros_desc) { + + auto info = GptqQyblasGemmInfo::createGptqQyblasGemmInfo(out_desc, a_desc, b_desc, b_scales_desc, 
b_zeros_desc); + + CHECK_RESULT(info); + + size_t workspace_size = 0; + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), workspace_size, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, + size_t workspace_size, + void *out, + const void *a, + const void *b, + void *b_scales, + void *b_zeros, + int64_t quant_type, + int64_t bit, + void *stream) const { + int64_t K = static_cast(_info.K); + + cudaDataType_t computeType_ = (cudaDataType_t)CUDA_R_32F; + cudaDataType_t kernel_Atype_, kernel_Btype_, kernel_Ctype_, kernel_Stype_, kernel_Ztype_; + auto dtype = _info.dtype; + auto weight_dtype = _info.weight_dtype; + if (_info.transpose_result) { + std::swap(a, b); + std::swap(dtype, weight_dtype); + } + kernel_Atype_ = ScalarTypeToCudaDataType(dtype); + kernel_Btype_ = ScalarTypeToCudaDataType(weight_dtype); + + if (quant_type == 0) { + if (8 == bit) { + kernel_Atype_ = (cudaDataType_t)CUDA_R_8U; + } + + if (4 == bit) { + kernel_Atype_ = (cudaDataType_t)CUDA_R_4U; + K = K * 2; + } + } + + kernel_Ctype_ = ScalarTypeToCudaDataType(_info.out_dtype); + kernel_Stype_ = ScalarTypeToCudaDataType(_info.scales_dtype); + kernel_Ztype_ = ScalarTypeToCudaDataType(_info.zeros_dtype); + + float alpha = 1.0f; + float beta = 0.0f; + + int64_t M = static_cast(_info.M); + int64_t N = static_cast(_info.N); + int64_t lda = static_cast(_info.lda); + int64_t ldb = static_cast(_info.ldb); + + int64_t scales_size_0 = static_cast(_info.scales_size_0); + int64_t scales_size_1 = static_cast(_info.scales_size_1); + + int64_t result_ld = static_cast(_info.result_ld); + + dlblasExtQuantParametersV2_t extParameters; + + if (quant_type == 0) { + extParameters.a_group_size_m = M / scales_size_1; + extParameters.a_group_size_k = K / scales_size_0; + extParameters.a_zeropoints_type = kernel_Ztype_; + extParameters.a_zeropoints = b_zeros; + extParameters.a_scales_type = 
kernel_Stype_; + extParameters.a_scales = b_scales; + } else if (quant_type == 1) { + extParameters.a_group_size_m = 1; + extParameters.a_group_size_k = K; + extParameters.a_zeropoints = nullptr; + extParameters.a_scales_type = kernel_Stype_; + extParameters.a_scales = b_scales; + + } else if (quant_type == 2 || quant_type == 3) { + // calculate block_shape according weight/scales shape + int block_shape = 128; + while ((M + block_shape - 1) / block_shape < scales_size_0) { + block_shape /= 2; + if (block_shape < 32) { + fprintf(stderr, + "INTERNAL ASSERT FAILED: block_shape >= 32\n" + "Invalid fp blockwise linear arguments. Weight: [%d, %d]. Scales: [%d, %d].\n", + (int)M, (int)K, (int)scales_size_0, (int)scales_size_1); + abort(); + } + } + if (!((K + block_shape - 1) / block_shape == scales_size_1)) { + fprintf(stderr, + "CHECK FAILED: (K + block_shape - 1) / block_shape == scales_size_1\n"); + abort(); + } + extParameters.a_group_size_m = block_shape; + extParameters.a_group_size_k = block_shape; + extParameters.a_scales_type = kernel_Stype_; + extParameters.a_zeropoints = nullptr; + extParameters.a_scales = b_scales; + } + bool transpose_mat_1 = _info.transa == 't'; + bool transpose_mat_2 = _info.transb == 't'; + cublasOperation_t transa = transpose_mat_1 ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transb = transpose_mat_2 ? 
CUBLAS_OP_T : CUBLAS_OP_N; + + if (_info.dtype == INFINI_DTYPE_F16 || _info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(_opaque->internal->useCublas( + (cudaStream_t)stream, + [&](cublasHandle_t handle) { + CHECK_CUBLAS( + dlblasGemmExV2(handle, + transa, + transb, + M, + N, + K, + &alpha, + a, + kernel_Atype_, + lda, + b, + kernel_Btype_, + ldb, + &beta, + out, + kernel_Ctype_, + result_ld, + computeType_, + CUBLAS_GEMM_DEFAULT_TENSOR_OP, + &extParameters)); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::gptq_qyblas_gemm::nvidia +#endif diff --git a/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cuh b/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cuh new file mode 100644 index 000000000..b489858d9 --- /dev/null +++ b/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __GPTQ_QYBLAS_GEMM_NVIDIA_API_H__ +#define __GPTQ_QYBLAS_GEMM_NVIDIA_API_H__ +#include "../gptq_qyblas_gemm.h" + +DESCRIPTOR(nvidia) + +#endif // __GPTQ_QYBLAS_GEMM_NVIDIA_API_H__ diff --git a/src/infiniop/ops/gptq_qyblas_gemm/operator.cc b/src/infiniop/ops/gptq_qyblas_gemm/operator.cc new file mode 100644 index 000000000..e7bdf7791 --- /dev/null +++ b/src/infiniop/ops/gptq_qyblas_gemm/operator.cc @@ -0,0 +1,103 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gptq_qyblas_gemm.h" + +#if defined ENABLE_QY_API +#include "nvidia/gptq_qyblas_gemm_nvidia.cuh" +#endif + +__INFINI_C infiniStatus_t infiniopCreateGptqQyblasGemmDescriptor( + infiniopHandle_t handle, + infiniopGptqQyblasGemmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t b_scales_desc, + infiniopTensorDescriptor_t b_zeros_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ 
+ return op::gptq_qyblas_gemm::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, a_desc, b_desc, b_scales_desc, b_zeros_desc); + + switch (handle->device) { + +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetGptqQyblasGemmWorkspaceSize( + infiniopGptqQyblasGemmDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopGptqQyblasGemm( + infiniopGptqQyblasGemmDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *a, + const void *b, + void *b_scale, + void *b_zero, + int64_t quant_type, + int64_t bit, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, out, a, b, b_scale, b_zero, quant_type, bit, stream); + + switch (desc->device_type) { + +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyGptqQyblasGemmDescriptor( + infiniopGptqQyblasGemmDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} diff --git a/test/infiniop/gptq_qyblas_gemm.py b/test/infiniop/gptq_qyblas_gemm.py new file mode 100644 index 000000000..44c82ab4e ---
/dev/null +++ b/test/infiniop/gptq_qyblas_gemm.py @@ -0,0 +1,577 @@ +import torch +import numpy +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, +) +from enum import Enum, auto +import itertools +from libinfiniop.scalar_type import scalar_types, ScalarType +from typing import TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Union + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +# Test configurations + +BLOCK_SIZE = [[128, 128]] +M_list = [1, 7]#, 83, 512, 2048] +N_list = [128, 512]#, 1024, 4096, 7748, 13824] +K_list = [256, 4096]#, 5120, 3884, 13824] + +_WEIGHT_DTYPES = [InfiniDtype.I8] + +SEEDS = 0 + +def to_iter(x): + return x if isinstance(x, (list, tuple)) else (x,) + + +_TEST_CASES = list( + itertools.product( + to_iter(M_list), + to_iter(K_list), + to_iter(N_list), + to_iter(BLOCK_SIZE), + to_iter(_WEIGHT_DTYPES), + ) +) + +_TEST_CASES_BIT = [ + # M , K, N, group_size, bit + (128, 128, 128, 128, 4), + (32768, 3584, 4608, 128, 4), + (32768, 3584, 4608, 128, 8), +] + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16] + +_TENSOR_DTYPES_BIT = [InfiniDtype.BF16, InfiniDtype.F16] + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +SUPPORTED_GPTQ_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128] +SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] + + +def native_w8a16_block_int8_matmul( + A, + B, + Bs, + block_size, + output_dtype: torch.float16, +) -> torch.Tensor: + """Matrix multiplication with block-wise 
quantization using native torch.""" + A = A.to(torch.float32) + B = B.to(torch.float32) + assert A.shape[-1] == B.shape[-1] + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + + M = A.numel() // A.shape[-1] + N, K = B.shape + origin_C_shape = A.shape[:-1] + (N, ) + A = A.reshape(M, A.shape[-1]) + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + assert n_tiles == Bs.shape[0] + assert k_tiles == Bs.shape[1] + + C_shape = (M, N) + C = torch.zeros(C_shape, dtype=torch.float32, device=A.device) + + A_tiles = [ + A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) + ] + B_tiles = [[ + B[j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles) + ] for j in range(n_tiles)] + C_tiles = [ + C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) + ] + + for i in range(k_tiles): + for j in range(n_tiles): + a = A_tiles[i] + b = B_tiles[j][i] + c = C_tiles[j] + s = Bs[j][i] + c[:, :] += torch.matmul(a, b.t()) * s + + C = C.reshape(origin_C_shape).to(output_dtype) + return C + + +def _gguf_quantize_weights(w: torch.Tensor, + quant_type: ScalarType, + group_size: Optional[int], + zero_points: bool = False, + ref_zero_points_after_scales: bool = False, + need_weight_ref: bool = True): + assert quant_type.is_integer(), \ + "Floating point quantization may work but has not been tested" + assert not zero_points or group_size is not None, \ + "to have group zero points, group_size must be provided "\ + "(-1 group_size is channelwise)" + + orig_device = w.device + orig_type = w.dtype + size_k, size_n = w.shape + + assert w.is_floating_point(), "w must be float" + + if group_size == -1: + group_size = size_k + + # Reshape to [groupsize, -1] + if group_size is not None and group_size < size_k: + w = w.reshape((-1, group_size, size_n)) + w = w.permute(1, 0, 2) + w = w.reshape((group_size, 
-1)) + + # Compute scale for each group + max_val = torch.max(w, 0, keepdim=True).values + min_val = torch.min(w, 0, keepdim=True).values + + max_q_val = quant_type.max() + min_q_val = quant_type.min() + + w_s = torch.Tensor([1.0]).to(w.device) # unscaled case + maybe_w_zp = None + if group_size is not None: + if zero_points: + assert not quant_type.is_signed() and quant_type.max() > 0 + w_s = (max_val - min_val).clamp(min=1e-5) / quant_type.max() + maybe_w_zp = torch.round(torch.abs(min_val / w_s)) \ + .clamp(min_q_val, max_q_val).int() + else: + # If the bias is such that there are no possible negative/positive + # values, set the max value to inf to avoid divide by 0 + w_s = torch.max( + abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)), + abs(min_val / (min_q_val if min_q_val != 0 else torch.inf))) + + # Quantize + w_q = torch.round(w / w_s).int() + (maybe_w_zp if zero_points else 0) + w_q = torch.clamp(w_q, min_q_val, max_q_val) + + # Compute ref (dequantized) + # For some kernels (namely Machete) the zero-points are applied after the + # scales are applied, for this case computing the reference in similar way + # allows us to use tighter error tolerances in our unit tests. 
+ if need_weight_ref: + if ref_zero_points_after_scales and maybe_w_zp is not None: + w_ref = w_q.to(orig_type) * w_s - maybe_w_zp.to(orig_type) * w_s + else: + w_ref = (w_q - (maybe_w_zp if zero_points else 0)).to(orig_type) * w_s + + if quant_type.has_bias(): + w_q += quant_type.bias + + # Restore original shapes + if group_size is not None and group_size < size_k: + + def reshape_w(w): + w = w.reshape((group_size, -1, size_n)) + w = w.permute(1, 0, 2) + w = w.reshape((size_k, size_n)).contiguous() + return w + + w_q = reshape_w(w_q) + if need_weight_ref: + w_ref = reshape_w(w_ref) + w_s = w_s.reshape((-1, size_n)).contiguous() + + if maybe_w_zp is not None: + maybe_w_zp = maybe_w_zp.reshape((-1, size_n)).contiguous() + maybe_w_zp = maybe_w_zp.to(device=orig_device) + + return ( + w_ref.to(device=orig_device) if need_weight_ref else None, + w_q.to(device=orig_device), + w_s if group_size is not None else None, + maybe_w_zp, + ) + +def gguf_quantize_weights(w: torch.Tensor, + group_size: int, + zero_points: bool = False, + need_weight_ref: bool = False, + bits: int = 4, + ref_zero_points_after_scales: bool = False, + params_dtype: torch.dtype = torch.float16): + size_k, _ = w.shape + + assert w.is_floating_point(), "w must be float" + assert group_size in SUPPORTED_GROUP_SIZES + [ + size_k + ], f"Unsupported groupsize = {group_size}" + + w_ref, w_q, w_s, w_z = _gguf_quantize_weights(w, quant_type=scalar_types.uint4 if bits == 4 else scalar_types.uint8, + group_size=group_size, + zero_points=zero_points, + need_weight_ref=need_weight_ref, + ref_zero_points_after_scales=ref_zero_points_after_scales) + + if zero_points: + w_z = w_z.to(params_dtype) + + w_q = w_q.to(torch.uint8) + + return w_ref, w_q, w_s, w_z + +def gguf_linear_quantize_weights(w: torch.Tensor, + group_size: int, + zero_points: bool = False, + need_weight_ref: bool = False, + bits: int =4, + params_dtype: torch.dtype = torch.float16): + w_ref, w_q, w_s, w_z = gguf_quantize_weights( + w=w, + 
group_size=group_size, + zero_points=zero_points, + need_weight_ref=need_weight_ref, + bits=bits, + ref_zero_points_after_scales=False, + params_dtype=params_dtype, + ) + + if bits == 4: + w_q = (w_q[:,1::2] << 4) | w_q[:, ::2] + w_q = w_q.reshape(w_q.shape[0]//2, -1) # This step is to match the parameters of the dlblasGemmExV2 + + return w_ref, w_q, w_s, w_z + + + +def test( + handle, + device, + M, + K, + N, + block_size, + weight_dtype=InfiniDtype.I8, + dtype=InfiniDtype.BF16, + sync=None, +): + + print( + f"Testing int8 Gptq Qyblas Gemm on {InfiniDeviceNames[device]} with M-K-N:{M, K, N}, block_size:{block_size}, weight dtype:{InfiniDtypeNames[weight_dtype]}, dtype:{InfiniDtypeNames[dtype]}" + ) + quant_type = 3 + bit = 8 + + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + A = TestTensor( + (M, K), + None, + dtype, + device, + ) + if weight_dtype == InfiniDtype.I8: + _info = torch.iinfo(torch.int8) + elif weight_dtype == InfiniDtype.U8: + _info = torch.iinfo(torch.uint8) + elif weight_dtype == InfiniDtype.F8: + _info = torch.iinfo(float8_e4m3fn) + B_orig = TestTensor( + (N, K), + None, + weight_dtype, + device, + randint_low=_info.min, + randint_high=_info.max, + ) + B_torch = B_orig.torch_tensor().t() + B = TestTensor( + (K, N), + B_torch.stride(), + weight_dtype, + device, + mode="manual", + set_tensor=B_torch, + ) + + b_scales = TestTensor( + (n_tiles, k_tiles), + None, + InfiniDtype.F32, + device, + ) + + b_zeros = TestTensor( + (n_tiles, k_tiles), + None, + InfiniDtype.F32, + device, + mode="zeros", + ) + + out = TestTensor( + (M, N), + None, + dtype, + device, + mode="zeros", + ) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGptqQyblasGemmDescriptor( + handle, + ctypes.byref(descriptor), + out.descriptor, + A.descriptor, + B.descriptor, + b_scales.descriptor, + b_zeros.descriptor, + ) + ) + # 
Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + + for tensor in [out, A, B, b_scales, b_zeros]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGptqQyblasGemmWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, A.device) + + def lib_gptq_qyblas_gemm(): + check_error( + LIBINFINIOP.infiniopGptqQyblasGemm( + descriptor, + workspace.data(), + workspace_size.value, + out.data(), + A.data(), + B.data(), + b_scales.data(), + b_zeros.data(), + quant_type, + bit, + None, + ) + ) + + lib_gptq_qyblas_gemm() + + if sync is not None: + sync() + + out_dtype = to_torch_dtype(dtype) + ans = native_w8a16_block_int8_matmul(A.torch_tensor(), B_orig.torch_tensor(), b_scales.torch_tensor(), block_size, out_dtype) + + rel_diff = (torch.mean( + torch.abs(out.actual_tensor().to(torch.float32) - ans.to(torch.float32))) / + torch.mean(torch.abs(ans.to(torch.float32)))) + + assert rel_diff < 0.05 + + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: native_w8a16_block_int8_matmul(A.torch_tensor(), B_orig.torch_tensor(), b_scales.torch_tensor(), block_size, out_dtype), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gptq_qyblas_gemm(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyGptqQyblasGemmDescriptor(descriptor)) + + +def test_bit( + handle, + device, + M, + K, + N, + group_size, + bit, + dtype=InfiniDtype.BF16, + sync=None, +): + + print( + f"Testing Gptq Qyblas Gemm on {InfiniDeviceNames[device]} with M-K-N:{M, K, N}, group_size:{group_size}, bit:{bit}, dtype:{InfiniDtypeNames[dtype]}" + ) + quant_type = 0 + # NOTE(review): removed hard-coded `bit = 4` override so the bit=8 case in _TEST_CASES_BIT is actually exercised + + k_tiles = (K + group_size - 1) // group_size + + A = TestTensor( + (M, K), + None, + dtype, + device, + ) + B_orig = TestTensor( + (K, N), + None, + dtype, + device, + ) + w_ref, w_q, w_s,
w_z = gguf_linear_quantize_weights(B_orig.torch_tensor(), + group_size=group_size, + zero_points=True, + need_weight_ref=True, + bits=bit, + params_dtype=to_torch_dtype(dtype)) + + B = TestTensor( + w_q.shape, + w_q.stride(), + InfiniDtype.U8, + device, + mode="manual", + set_tensor=w_q, + ) + + + b_scales = TestTensor( + w_s.shape, + w_s.stride(), + dtype, + device, + mode="manual", + set_tensor=w_s, + ) + + b_zeros = TestTensor( + w_z.shape, + w_z.stride(), + dtype, + device, + mode="manual", + set_tensor=w_z, + ) + + out = TestTensor( + (M, N), + None, + dtype, + device, + mode="zeros", + ) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGptqQyblasGemmDescriptor( + handle, + ctypes.byref(descriptor), + out.descriptor, + A.descriptor, + B.descriptor, + b_scales.descriptor, + b_zeros.descriptor, + ) + ) + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + + for tensor in [out, A, B, b_scales, b_zeros]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGptqQyblasGemmWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, A.device) + + def lib_gptq_qyblas_gemm(): + check_error( + LIBINFINIOP.infiniopGptqQyblasGemm( + descriptor, + workspace.data(), + workspace_size.value, + out.data(), + A.data(), + B.data(), + b_scales.data(), + b_zeros.data(), + quant_type, + bit, + None, + ) + ) + + lib_gptq_qyblas_gemm() + + if sync is not None: + sync() + + atol, rtol = 2e-2, 2e-2 + if bit == 8: + atol, rtol = 2e-2, 0 + else: + atol, rtol = 2e-2, 2e-2 + ans = torch.matmul(A.torch_tensor(), w_ref.to(A.torch_tensor().device)) + if DEBUG: + debug(out.actual_tensor(), ans, atol=atol, rtol=rtol) + + assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol) + + + # Profiling workflow + if PROFILE: + # fmt: off + 
profile_operation("PyTorch", lambda: torch.matmul(A.torch_tensor(), w_ref.to(A.torch_tensor().device)), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gptq_qyblas_gemm(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyGptqQyblasGemmDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + for device in get_test_devices(args): + test_operator(device, test_bit, _TEST_CASES_BIT, _TENSOR_DTYPES_BIT) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 9a91c931c..26ade4c2a 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -13,6 +13,7 @@ c_double, c_int64, c_bool, + c_int64, ) @@ -1313,6 +1314,45 @@ def per_tensor_dequant_int8_(lib): ] +@OpRegister.operator +def gptq_qyblas_gemm_(lib): + lib.infiniopCreateGptqQyblasGemmDescriptor.restype = c_int32 + lib.infiniopCreateGptqQyblasGemmDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetGptqQyblasGemmWorkspaceSize.restype = c_int32 + lib.infiniopGetGptqQyblasGemmWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGptqQyblasGemm.restype = c_int32 + lib.infiniopGptqQyblasGemm.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_int64, + c_int64, + c_void_p, + ] + + lib.infiniopDestroyGptqQyblasGemmDescriptor.restype = 
c_int32 + lib.infiniopDestroyGptqQyblasGemmDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + @OpRegister.operator def softplus_(lib): lib.infiniopCreateSoftplusDescriptor.restype = c_int32 diff --git a/test/infiniop/libinfiniop/scalar_type.py b/test/infiniop/libinfiniop/scalar_type.py new file mode 100644 index 000000000..af74a3c6e --- /dev/null +++ b/test/infiniop/libinfiniop/scalar_type.py @@ -0,0 +1,356 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import functools +import struct +from dataclasses import dataclass +from enum import Enum + +_SCALAR_TYPES_ID_MAP = {} + + +# Mirrors enum in `core/scalar_type.hpp` +class NanRepr(Enum): + NONE = 0 # nans are not supported + IEEE_754 = 1 # nans are: Exp all 1s, mantissa not all 0s + EXTD_RANGE_MAX_MIN = 2 # nans are: Exp all 1s, mantissa all 1s + + +# This ScalarType class is a parallel implementation of the C++ ScalarType +# class found in csrc/core/scalar_type.hpp. These two classes should be kept +# in sync until the inductor fully supports custom C++ classes. +@dataclass(frozen=True) +class ScalarType: + """ + ScalarType can represent a wide range of floating point and integer + types, in particular it can be used to represent sub-byte data types + (something that torch.dtype currently does not support). It is also + capable of representing types with a bias, i.e.: + `stored_value = value + bias`, + this is useful for quantized types (e.g. standard GPTQ 4bit uses a bias + of 8). The implementation for this class can be found in + csrc/core/scalar_type.hpp, these type signatures should be kept in sync + with that file. + """ + + exponent: int + """ + Number of bits in the exponent if this is a floating point type + (zero if this an integer type) + """ + + mantissa: int + """ + Number of bits in the mantissa if this is a floating point type, + or the number bits representing an integer excluding the sign bit if + this an integer type. 
+ """ + + signed: bool + "If the type is signed (i.e. has a sign bit)" + + bias: int + """ + bias used to encode the values in this scalar type + (value = stored_value - bias, default 0) for example if we store the + type as an unsigned integer with a bias of 128 then the value 0 will be + stored as 128 and -1 will be stored as 127 and 1 will be stored as 129. + """ + + _finite_values_only: bool = False + """ + Private: if infs are supported, used `has_infs()` instead. + """ + + nan_repr: NanRepr = NanRepr.IEEE_754 + """ + How NaNs are represent in this scalar type, returns NanRepr value. + (not applicable for integer types) + """ + + def _floating_point_max_int(self) -> int: + assert self.mantissa <= 52 and self.exponent <= 11, ( + f"Cannot represent max/min as a double for type {self.__str__()}" + ) + + max_mantissa = (1 << self.mantissa) - 1 + if self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN: + max_mantissa = max_mantissa - 1 + + max_exponent = (1 << self.exponent) - 2 + if self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN or self.nan_repr == NanRepr.NONE: + assert self.exponent < 11, ( + f"Cannot represent max/min as a double for type {self.__str__()}" + ) + max_exponent = max_exponent + 1 + + # adjust the exponent to match that of a double + # for now we assume the exponent bias is the standard 2^(e-1) -1, (where + # e is the exponent bits), there is some precedent for non-standard + # biases, example `float8_e4m3b11fnuz` here: + # https://github.com/jax-ml/ml_dtypes but to avoid premature over + # complication we are just assuming the standard exponent bias until + # there is a need to support non-standard biases + exponent_bias = (1 << (self.exponent - 1)) - 1 + exponent_bias_double = (1 << 10) - 1 # double e = 11 + + max_exponent_double = max_exponent - exponent_bias + exponent_bias_double + + # shift the mantissa and exponent into the proper positions for an + # IEEE double and bitwise-or them together. 
+ return (max_mantissa << (52 - self.mantissa)) | (max_exponent_double << 52) + + def _floating_point_max(self) -> float: + double_raw = self._floating_point_max_int() + return struct.unpack("!d", struct.pack("!Q", double_raw))[0] + + def _raw_max(self) -> int | float: + if self.is_floating_point(): + return self._floating_point_max() + else: + assert self.size_bits < 64 or self.size_bits == 64 and self.is_signed(), ( + "Cannot represent max as an int" + ) + return (1 << self.mantissa) - 1 + + def _raw_min(self) -> int | float: + if self.is_floating_point(): + assert self.is_signed(), ( + "We currently assume all floating point types are signed" + ) + sign_bit_double = 1 << 63 + + max_raw = self._floating_point_max_int() + min_raw = max_raw | sign_bit_double + return struct.unpack("!d", struct.pack("!Q", min_raw))[0] + else: + assert not self.is_signed() or self.size_bits <= 64, ( + "Cannot represent min as a int64_t" + ) + + if self.is_signed(): + return -(1 << (self.size_bits - 1)) + else: + return 0 + + @functools.cached_property + def id(self) -> int: + """ + Convert the ScalarType to an int which can be passed to pytorch custom + ops. This layout of the int must be kept in sync with the C++ + ScalarType's from_id method. 
+ """ + val = 0 + offset = 0 + + def or_and_advance(member, bit_width): + nonlocal val + nonlocal offset + bit_mask = (1 << bit_width) - 1 + val = val | (int(member) & bit_mask) << offset + offset = offset + bit_width + + or_and_advance(self.exponent, 8) + or_and_advance(self.mantissa, 8) + or_and_advance(self.signed, 1) + or_and_advance(self.bias, 32) + or_and_advance(self._finite_values_only, 1) + or_and_advance(self.nan_repr.value, 8) + + assert offset <= 64, f"ScalarType fields too big {offset} to fit into an int64" + + _SCALAR_TYPES_ID_MAP[val] = self + + return val + + @property + def size_bits(self) -> int: + return self.exponent + self.mantissa + int(self.signed) + + def min(self) -> int | float: + """ + Min representable value for this scalar type. + (accounting for bias if there is one) + """ + return self._raw_min() - self.bias + + def max(self) -> int | float: + """ + Max representable value for this scalar type. + (accounting for bias if there is one) + """ + return self._raw_max() - self.bias + + def is_signed(self) -> bool: + """ + If the type is signed (i.e. 
has a sign bit), same as `signed` + added for consistency with: + https://pytorch.org/docs/stable/generated/torch.Tensor.is_signed.html + """ + return self.signed + + def is_floating_point(self) -> bool: + "If the type is a floating point type" + return self.exponent != 0 + + def is_integer(self) -> bool: + "If the type is an integer type" + return self.exponent == 0 + + def has_bias(self) -> bool: + "If the type has a non-zero bias" + return self.bias != 0 + + def has_infs(self) -> bool: + "If the type is floating point and supports infinity" + return not self._finite_values_only + + def has_nans(self) -> bool: + return self.nan_repr != NanRepr.NONE + + def is_ieee_754(self) -> bool: + """ + If the type is a floating point type that follows IEEE 754 + conventions + """ + return self.nan_repr == NanRepr.IEEE_754 and not self._finite_values_only + + def __str__(self) -> str: + """ + naming generally follows: https://github.com/jax-ml/ml_dtypes + for floating point types (leading f) the scheme is: + `float_em[flags]` + flags: + - no-flags: means it follows IEEE 754 conventions + - f: means finite values only (no infinities) + - n: means nans are supported (non-standard encoding) + for integer types the scheme is: + `[u]int[b]` + - if bias is not present it means its zero + """ + if self.is_floating_point(): + ret = ( + "float" + + str(self.size_bits) + + "_e" + + str(self.exponent) + + "m" + + str(self.mantissa) + ) + + if not self.is_ieee_754(): + if self._finite_values_only: + ret = ret + "f" + if self.nan_repr != NanRepr.NONE: + ret = ret + "n" + + return ret + else: + ret = ("int" if self.is_signed() else "uint") + str(self.size_bits) + if self.has_bias(): + ret = ret + "b" + str(self.bias) + return ret + + def __repr__(self) -> str: + return "ScalarType." + self.__str__() + + # __len__ needs to be defined (and has to throw TypeError) for pytorch's + # opcheck to work.
+ def __len__(self) -> int: + raise TypeError + + # + # Convenience Constructors + # + + @classmethod + def int_(cls, size_bits: int, bias: int | None) -> "ScalarType": + "Create a signed integer scalar type (size_bits includes sign-bit)." + ret = cls(0, size_bits - 1, True, bias if bias else 0) + ret.id # noqa B018: make sure the id is cached + return ret + + @classmethod + def uint(cls, size_bits: int, bias: int | None) -> "ScalarType": + """Create an unsigned integer scalar type.""" + ret = cls(0, size_bits, False, bias if bias else 0) + ret.id # noqa B018: make sure the id is cached + return ret + + @classmethod + def float_IEEE754(cls, exponent: int, mantissa: int) -> "ScalarType": + """ + Create a standard floating point type + (i.e. follows IEEE 754 conventions). + """ + assert mantissa > 0 and exponent > 0 + ret = cls(exponent, mantissa, True, 0) + ret.id # noqa B018: make sure the id is cached + return ret + + @classmethod + def float_( + cls, exponent: int, mantissa: int, finite_values_only: bool, nan_repr: NanRepr + ) -> "ScalarType": + """ + Create a non-standard floating point type + (i.e. does not follow IEEE 754 conventions). 
+ """ + assert mantissa > 0 and exponent > 0 + assert nan_repr != NanRepr.IEEE_754, ( + "use `float_IEEE754` constructor for floating point types that " + "follow IEEE 754 conventions" + ) + ret = cls(exponent, mantissa, True, 0, finite_values_only, nan_repr) + ret.id # noqa B018: make sure the id is cached + return ret + + @classmethod + def from_id(cls, scalar_type_id: int): + if scalar_type_id not in _SCALAR_TYPES_ID_MAP: + raise ValueError(f"scalar_type_id {scalar_type_id} doesn't exists.") + return _SCALAR_TYPES_ID_MAP[scalar_type_id] + + +# naming generally follows: https://github.com/jax-ml/ml_dtypes +# for floating point types (leading f) the scheme is: +# `float_em[flags]` +# flags: +# - no-flags: means it follows IEEE 754 conventions +# - f: means finite values only (no infinities) +# - n: means nans are supported (non-standard encoding) +# for integer types the scheme is: +# `[u]int[b]` +# - if bias is not present it means its zero + + +class scalar_types: + int4 = ScalarType.int_(4, None) + uint4 = ScalarType.uint(4, None) + int8 = ScalarType.int_(8, None) + uint8 = ScalarType.uint(8, None) + float8_e4m3fn = ScalarType.float_(4, 3, True, NanRepr.EXTD_RANGE_MAX_MIN) + float8_e5m2 = ScalarType.float_IEEE754(5, 2) + float8_e8m0fnu = ScalarType(8, 0, False, 0, True, NanRepr.EXTD_RANGE_MAX_MIN) + float16_e8m7 = ScalarType.float_IEEE754(8, 7) + float16_e5m10 = ScalarType.float_IEEE754(5, 10) + + # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main + # and https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf + float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE) + + float6_e2m3f = ScalarType.float_(2, 3, True, NanRepr.NONE) + + # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf + float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE) + + # "gptq" types + uint2b2 = ScalarType.uint(2, 2) + uint3b4 = ScalarType.uint(3, 4) + uint4b8 = ScalarType.uint(4, 8) + uint8b128 = 
ScalarType.uint(8, 128) + + # colloquial names + bfloat16 = float16_e8m7 + float16 = float16_e5m10 + diff --git a/xmake.lua b/xmake.lua index 8f32bf7cc..c69e0170d 100644 --- a/xmake.lua +++ b/xmake.lua @@ -362,6 +362,9 @@ target("infiniop") add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/nvidia/*.cu.o", {public = true}) add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/*/nvidia/*.cu.o", {public = true}) add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/devices/nvidia/*.cu.o", {public = true}) + add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/qy/*.cu.o", {public = true}) + add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/*/qy/*.cu.o", {public = true}) + add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/devices/qy/*.cu.o", {public = true}) end if has_config("cambricon-mlu") then diff --git a/xmake/qy.lua b/xmake/qy.lua index a2fe269b8..31b65c33c 100644 --- a/xmake/qy.lua +++ b/xmake/qy.lua @@ -142,6 +142,7 @@ target("infiniop-qy") set_languages("cxx17") add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../src/infiniop/ops/*/*/nvidia/*.cu") + add_files("../src/infiniop/devices/qy/*.cu", "../src/infiniop/ops/*/qy/*.cu", "../src/infiniop/ops/*/*/qy/*.cu") if has_config("ninetoothed") then add_files("../build/ninetoothed/*.c", "../build/ninetoothed/*.cpp")