diff --git a/include/infinicore/nn/linear.hpp b/include/infinicore/nn/linear.hpp index 89a832ceb..81e9944c0 100644 --- a/include/infinicore/nn/linear.hpp +++ b/include/infinicore/nn/linear.hpp @@ -34,6 +34,7 @@ class BaseLinear : public Module { Tensor bias() const { return bias_; } Tensor weight_scale() const { return weight_scale_; } Tensor weight_zeros() const { return weight_zeros_; } + Tensor gidx() const { return gidx_; } std::shared_ptr get_quantization() const { return quantization_; } @@ -45,6 +46,8 @@ class BaseLinear : public Module { INFINICORE_NN_PARAMETER(weight_scale); INFINICORE_NN_PARAMETER(weight_zeros); + INFINICORE_NN_PARAMETER(gidx); + protected: // Helper method for common forward computation Tensor compute_linear(Tensor &input) const; diff --git a/include/infinicore/ops/linear_w4a16_gptq_qy.hpp b/include/infinicore/ops/linear_w4a16_gptq_qy.hpp new file mode 100644 index 000000000..72cc9eef4 --- /dev/null +++ b/include/infinicore/ops/linear_w4a16_gptq_qy.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include "common/op.hpp" +#include + +namespace infinicore::op { + +Tensor linear_w4a16_gptq_qy(Tensor in, Tensor qweight, Tensor qzeros, Tensor scales, int64_t quant_type, int64_t bit); + +void linear_w4a16_gptq_qy_(Tensor out, Tensor in, Tensor qweights, Tensor scales, Tensor qzeros, int64_t quant_type, int64_t bit); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp b/include/infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp new file mode 100644 index 000000000..33ee7c37b --- /dev/null +++ b/include/infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" +#include + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(GptqQyblasGemm, Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t); + +void scaled_mm_w4a16_gptq_qy_(Tensor out, const Tensor &in, const Tensor &qweight, 
const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit); +} // namespace infinicore::op diff --git a/include/infinicore/quantization.hpp b/include/infinicore/quantization.hpp index 7b01312ba..61adaadfa 100644 --- a/include/infinicore/quantization.hpp +++ b/include/infinicore/quantization.hpp @@ -3,5 +3,6 @@ #include "quantization/awq.hpp" #include "quantization/base_quantization.hpp" #include "quantization/compressed_tensors.hpp" +#include "quantization/gptq_qy.hpp" #include "quantization/none_quantizaiton.hpp" #include "quantization/quantization_scheme.hpp" diff --git a/include/infinicore/quantization/base_quantization.hpp b/include/infinicore/quantization/base_quantization.hpp index 53f97deef..38f45f316 100644 --- a/include/infinicore/quantization/base_quantization.hpp +++ b/include/infinicore/quantization/base_quantization.hpp @@ -6,7 +6,7 @@ namespace infinicore::quantization { class BaseQuantization { // Base class for quantization schemes. Intended to be extended to support various quantization methods. public: - explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config){}; + explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; virtual ~BaseQuantization() = default; virtual infinicore::quantization::QuantScheme get_quant_scheme() const = 0; diff --git a/include/infinicore/quantization/gptq_qy.hpp b/include/infinicore/quantization/gptq_qy.hpp new file mode 100644 index 000000000..d1712b257 --- /dev/null +++ b/include/infinicore/quantization/gptq_qy.hpp @@ -0,0 +1,30 @@ +#pragma once +#include "base_quantization.hpp" +namespace infinicore::quantization { + +class GPTQ_QY : public BaseQuantization { + // This is a temporary class that currently only returns GPTQ W4A16. + // Future enhancements should parse quant_config to extract detailed quantization + // information and support multiple quantization schemes. 
+public: + explicit GPTQ_QY(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) {}; + + infinicore::quantization::QuantScheme + get_quant_scheme() const override { + return infinicore::quantization::QuantScheme::GPTQ_W4A16_QY; + }; + + int get_packing_num() const { + // For GPTQ, we pack 8 int4 weights into a single int32 value. + return 32 / this->get_or("bits", 4); // Default to 8 if not specified in config + } + + int get_group_size() const { + // For simplicity, we return a fixed group size here. In a more complete implementation, + // this could be extracted from quant_config_ to support different group sizes. + return this->get_or("group_size", 128); // Standard GPTQ group size + } +}; + +} // namespace infinicore::quantization diff --git a/include/infinicore/quantization/quantization_scheme.hpp b/include/infinicore/quantization/quantization_scheme.hpp index 9c08ea6e0..f2c98321f 100644 --- a/include/infinicore/quantization/quantization_scheme.hpp +++ b/include/infinicore/quantization/quantization_scheme.hpp @@ -7,6 +7,7 @@ enum class QuantScheme { NONE, COMPRESSED_TENSOR_W8A8I8, AWQ_W4A16, + GPTQ_W4A16_QY, }; enum class KVQuantAlgo { diff --git a/include/infiniop.h b/include/infiniop.h index f38fc9744..aeefe952a 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -48,6 +48,7 @@ #include "infiniop/ops/fmod.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/gptq_qyblas_gemm.h" #include "infiniop/ops/hardswish.h" #include "infiniop/ops/hardtanh.h" #include "infiniop/ops/hinge_embedding_loss.h" diff --git a/include/infiniop/ops/gemm.h b/include/infiniop/ops/gemm.h index 3d4d4dc0c..430e37003 100644 --- a/include/infiniop/ops/gemm.h +++ b/include/infiniop/ops/gemm.h @@ -6,22 +6,22 @@ typedef struct InfiniopDescriptor *infiniopGemmDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateGemmDescriptor(infiniopHandle_t handle, - infiniopGemmDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t 
c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc); + infiniopGemmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); __INFINI_C __export infiniStatus_t infiniopGetGemmWorkspaceSize(infiniopGemmDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopGemm(infiniopGemmDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - void const *a, - void const *b, - float alpha, - float beta, - void *stream); + void *workspace, + size_t workspace_size, + void *c, + void const *a, + void const *b, + float alpha, + float beta, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc); diff --git a/include/infiniop/ops/gptq_qyblas_gemm.h b/include/infiniop/ops/gptq_qyblas_gemm.h new file mode 100644 index 000000000..bb105132c --- /dev/null +++ b/include/infiniop/ops/gptq_qyblas_gemm.h @@ -0,0 +1,37 @@ +#ifndef __INFINIOP_GPTQ_QYBLAS_GEMM_API_H__ +#define __INFINIOP_GPTQ_QYBLAS_GEMM_API_H__ + +#include "../operator_descriptor.h" +#include + +typedef struct InfiniopDescriptor *infiniopGptqQyblasGemmDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateGptqQyblasGemmDescriptor( + infiniopHandle_t handle, + infiniopGptqQyblasGemmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t b_scales_desc, + infiniopTensorDescriptor_t b_zeros_desc); + +__INFINI_C __export infiniStatus_t infiniopGetGptqQyblasGemmWorkspaceSize( + infiniopGptqQyblasGemmDescriptor_t desc, + size_t *size); + +__INFINI_C __export infiniStatus_t infiniopGptqQyblasGemm( + infiniopGptqQyblasGemmDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *a, + const void *b, + void *b_scale, + void *b_zero, + int64_t quant_type, + int64_t bit, + void *stream); + 
+__INFINI_C __export infiniStatus_t infiniopDestroyGptqQyblasGemmDescriptor( + infiniopGptqQyblasGemmDescriptor_t desc); +#endif diff --git a/src/infinicore/nn/linear.cc b/src/infinicore/nn/linear.cc index 05342e651..ebe607bab 100644 --- a/src/infinicore/nn/linear.cc +++ b/src/infinicore/nn/linear.cc @@ -4,6 +4,7 @@ #include "infinicore/ops/distributed/allreduce.hpp" #include "infinicore/ops/linear.hpp" #include "infinicore/ops/linear_w4a16_awq.hpp" +#include "infinicore/ops/linear_w4a16_gptq_qy.hpp" #include "infinicore/ops/linear_w8a8i8.hpp" #include #include @@ -53,6 +54,19 @@ Tensor BaseLinear::compute_linear(Tensor &input) const { auto output = infinicore::op::linear_w4a16_awq(input_contiguous->contiguous(), qweight, scales, qzeros, bias_opt); return output; } + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { + Tensor input_contiguous = input->is_contiguous() ? input : input->contiguous(); + Tensor qweight = static_cast(weight_); + Tensor qzeros = static_cast(weight_zeros_); + Tensor scales = static_cast(weight_scale_); + Tensor g_idx = static_cast(gidx_); + std::optional bias_opt = has_bias_ ? 
std::make_optional(static_cast(bias_)) : std::nullopt; + auto output = infinicore::op::linear_w4a16_gptq_qy(input_contiguous->contiguous(), qweight, qzeros, scales, 0, 4); + if (bias_opt.has_value()) { + infinicore::op::add_(output, output, bias_opt.value()->as_strided(output->shape(), {0, 0, 1})); + } + return output; + } default: { // Ensure input is contiguous before creating views (required for matmul) // This prevents hanging when input tensor has non-contiguous memory layout @@ -140,6 +154,23 @@ Linear::Linear(size_t in_features, size_t out_features, } break; } + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { + weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device); + this->register_parameter("qweight", weight_); + weight_zeros_ = infinicore::nn::Parameter({in_features / 128, out_features}, dtype_, device); + this->register_parameter("qzeros", weight_zeros_); + weight_scale_ = infinicore::nn::Parameter({in_features / 128, out_features}, dtype_, device); + this->register_parameter("scales", weight_scale_); + + gidx_ = infinicore::nn::Parameter({in_features}, infinicore::DataType::I32, device); + this->register_parameter("g_idx", gidx_); + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device)); + } else { + bias_ = Parameter(); + } + break; + } default: { // Initialize parameters using macro INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device)); @@ -247,6 +278,27 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur } break; } + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { + auto gptq_ptr = std::static_pointer_cast(this->quantization_); + int group_size = gptq_ptr->get_group_size(); + int packing_num = gptq_ptr->get_packing_num(); + weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device, 1, tp_rank_, tp_size_); + this->register_parameter("qweight", 
weight_); + weight_zeros_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 1, tp_rank_, tp_size_); + this->register_parameter("qzeros", weight_zeros_); + weight_scale_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 1, tp_rank_, tp_size_); + this->register_parameter("scales", weight_scale_); + gidx_ = infinicore::nn::Parameter({in_features}, + infinicore::DataType::I32, + device, 0, tp_rank_, tp_size_); + this->register_parameter("g_idx", gidx_); + if (bias) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, tp_rank_, tp_size_)); + } else { + bias_ = Parameter(); + } + break; + } default: { // Initialize parameters using macro INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, @@ -356,6 +408,34 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, st } break; } + case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { + // GPTQ W4A16 QY for RowParallelLinear:切分维度为 in_features(权重矩阵的第1维) + // - Weight: packed int4 in U8 containers (8 int4 per U8) + // - Group-wise quantization with group_size=128 + // - Scale and zero points stored per group along in_features dimension + + auto gptq_ptr = std::static_pointer_cast(this->quantization_); + int group_size = gptq_ptr->get_group_size(); + int packing_num = gptq_ptr->get_packing_num(); + + weight_ = infinicore::nn::Parameter({in_features / 2, out_features}, infinicore::DataType::U8, device, 0, tp_rank_, tp_size_); + this->register_parameter("qweight", weight_); + weight_zeros_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 0, tp_rank_, tp_size_); + this->register_parameter("qzeros", weight_zeros_); + weight_scale_ = infinicore::nn::Parameter({in_features / group_size, out_features}, dtype_, device, 0, tp_rank_, tp_size_); + this->register_parameter("scales", weight_scale_); + + gidx_ = infinicore::nn::Parameter({in_features}, 
+ infinicore::DataType::I32, + device, 0, tp_rank_, tp_size_); + this->register_parameter("g_idx", gidx_); + if (bias && (0 == tp_rank_)) { + INFINICORE_NN_PARAMETER_INIT(bias, ({out_features}, dtype_, device, 0, 0, 1)); + } else { + bias_ = Parameter(); + } + break; + } default: { // Initialize parameters using macro INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, dtype_, device, diff --git a/src/infinicore/ops/linear_w4a16_gptq_qy/linear_w4a16_gptq_qy.cc b/src/infinicore/ops/linear_w4a16_gptq_qy/linear_w4a16_gptq_qy.cc new file mode 100644 index 000000000..bb088d709 --- /dev/null +++ b/src/infinicore/ops/linear_w4a16_gptq_qy/linear_w4a16_gptq_qy.cc @@ -0,0 +1,54 @@ +#include "infinicore/ops/linear_w4a16_gptq_qy.hpp" +#include "infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp" +#include +namespace infinicore::op { + +Tensor linear_w4a16_gptq_qy(Tensor input, Tensor qweight, Tensor qzeros, Tensor scales, int64_t quant_type, int64_t bit) { + + Size ndim = input->ndim(); + + Size out_features = qweight->shape()[1]; + + // 2. 计算输出形状 [..., out_features] + auto output_shape = input->shape(); + output_shape[ndim - 1] = out_features; + + // 3. 分配输出显存 + auto out = Tensor::zeros(output_shape, input->dtype(), input->device()); + + // 4. 
执行计算 + linear_w4a16_gptq_qy_(out, input, qweight, scales, qzeros, quant_type, bit); + + return out; +} + +void linear_w4a16_gptq_qy_(Tensor out, Tensor in, Tensor qweights, Tensor scales, Tensor qzeros, int64_t quant_type, int64_t bit) { + + Size in_features = qweights->shape()[0] * 2; // ✅ 修正:第 0 维是 in/2 + Size out_features = qweights->shape()[1]; // ✅ 修正:第 1 维是 out + + // 检查输入输出维度 + Size ndim = in->ndim(); + + // ======================================================================== + // 合并 Batch 维度 + // ======================================================================== + Size N = 1; + auto input_shape = in->shape(); + for (size_t i = 0; i < ndim - 1; ++i) { + N *= input_shape[i]; + } + + op::scaled_mm_w4a16_gptq_qy_( + out->view({N, out_features}), // Output: [N, out] + in->view({N, in_features}), // Input: [N, in] + qweights, // Weight: [in/2, out] + scales, // Scales: [in/group, out] + qzeros, // QZeros: [in/group, out] + quant_type, // Quantization type + bit // Bit width + ); + // out->debug(); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy.cc b/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy.cc new file mode 100644 index 000000000..3afb715f0 --- /dev/null +++ b/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy.cc @@ -0,0 +1,23 @@ +#include "infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp" +#include "../../utils.hpp" +#include + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(GptqQyblasGemm); + +GptqQyblasGemm::GptqQyblasGemm(Tensor out, const Tensor &in, const Tensor &qweight, const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, in, qweight, scales, qzeros); + INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, in, qweight, scales, qzeros, quant_type, bit); +} + +void GptqQyblasGemm::execute(Tensor out, const Tensor &in, const Tensor &qweight, const 
Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(GptqQyblasGemm, out, in, qweight, scales, qzeros, quant_type, bit); +} + +void scaled_mm_w4a16_gptq_qy_(Tensor out, const Tensor &in, const Tensor &qweight, const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit) { + + GptqQyblasGemm::execute(out, in, qweight, scales, qzeros, quant_type, bit); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy_infiniop.cc b/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy_infiniop.cc new file mode 100644 index 000000000..5ead44772 --- /dev/null +++ b/src/infinicore/ops/scaled_mm_w4a16_gptq_qy/scaled_mm_w4a16_gptq_qy_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "../infiniop_impl.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/scaled_mm_w4a16_gptq_qy.hpp" +#include + +namespace infinicore::op::scaled_mm_w4a16_gptq_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, GptqQyblasGemm, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, out, in, qweight, scales, qzeros; + int64_t quant_type, bit; +}; + +void *plan(Tensor out, const Tensor &in, const Tensor &qweight, const Tensor &scales, const Tensor &qzeros, int64_t quant_type, int64_t bit) { + size_t seed = hash_combine(out, in, qweight, scales, qzeros); + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, GptqQyblasGemm, + seed, + out->desc(), in->desc(), qweight->desc(), scales->desc(), qzeros->desc()); + INFINIOP_WORKSPACE_TENSOR(workspace, GptqQyblasGemm, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(out), + graph::GraphTensor(in), + graph::GraphTensor(qweight), + graph::GraphTensor(scales), + graph::GraphTensor(qzeros), + quant_type, bit}; +} + +void 
run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + INFINICORE_CHECK_ERROR(infiniopGptqQyblasGemm( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->out->data(), + planned->in->data(), + planned->qweight->data(), + planned->scales->data(), + planned->qzeros->data(), + planned->quant_type, planned->bit, + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(GptqQyblasGemm, &plan, &run, &cleanup); + +} // namespace infinicore::op::scaled_mm_w4a16_gptq_impl::infiniop diff --git a/src/infinicore/tensor/debug.cc b/src/infinicore/tensor/debug.cc index b57b00a52..9b4e97777 100644 --- a/src/infinicore/tensor/debug.cc +++ b/src/infinicore/tensor/debug.cc @@ -109,6 +109,20 @@ void print_data_i8(const int8_t *data, const Shape &shape, const Strides &stride } } +// Function for printing U8 data +void print_data_u8(const uint8_t *data, const Shape &shape, const Strides &strides, size_t dim) { + if (dim == shape.size() - 1) { + for (size_t i = 0; i < shape[dim]; i++) { + std::cout << static_cast(data[i * strides[dim]]) << " "; + } + std::cout << std::endl; + } else if (dim < shape.size() - 1) { + for (size_t i = 0; i < shape[dim]; i++) { + print_data_u8(data + i * strides[dim], shape, strides, dim + 1); + } + } +} + // Template function for writing data recursively to binary file (handles non-contiguous tensors) template void write_binary_data(std::ofstream &out, const T *data, const Shape &shape, const Strides &strides, size_t dim) { @@ -191,8 +205,8 @@ void TensorImpl::debug(const std::string &filename) const { cpu_tensor->shape(), cpu_tensor->strides(), 0); break; case DataType::U8: - print_data(reinterpret_cast(cpu_data), - cpu_tensor->shape(), cpu_tensor->strides(), 0); + print_data_u8(reinterpret_cast(cpu_data), + cpu_tensor->shape(), cpu_tensor->strides(), 0); 
break; case DataType::I8: print_data_i8(reinterpret_cast(cpu_data), diff --git a/src/infiniop/ops/gemm/operator.cc b/src/infiniop/ops/gemm/operator.cc index ac4e01e83..81d9cb066 100644 --- a/src/infiniop/ops/gemm/operator.cc +++ b/src/infiniop/ops/gemm/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/gemm_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API) #include "nvidia/gemm_nvidia.cuh" #endif #ifdef ENABLE_CAMBRICON_API @@ -23,7 +23,9 @@ #ifdef ENABLE_KUNLUN_API #include "kunlun/gemm_kunlun.h" #endif - +#ifdef ENABLE_QY_API +#include "qy/gemm_qy.cuh" +#endif __INFINI_C infiniStatus_t infiniopCreateGemmDescriptor( infiniopHandle_t handle, infiniopGemmDescriptor_t *desc_ptr, @@ -55,7 +57,7 @@ __INFINI_C infiniStatus_t infiniopCreateGemmDescriptor( CREATE(INFINI_DEVICE_ALI, nvidia); #endif #ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); + CREATE(INFINI_DEVICE_QY, qy); #endif #ifdef ENABLE_HYGON_API CREATE(INFINI_DEVICE_HYGON, nvidia); @@ -109,7 +111,7 @@ infiniopGetGemmWorkspaceSize( GET(INFINI_DEVICE_ALI, nvidia); #endif #ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); + GET(INFINI_DEVICE_QY, qy); #endif #ifdef ENABLE_HYGON_API GET(INFINI_DEVICE_HYGON, nvidia); @@ -170,7 +172,7 @@ __INFINI_C infiniStatus_t infiniopGemm( CALCULATE(INFINI_DEVICE_ALI, nvidia); #endif #ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); + CALCULATE(INFINI_DEVICE_QY, qy); #endif #ifdef ENABLE_HYGON_API CALCULATE(INFINI_DEVICE_HYGON, nvidia); @@ -221,7 +223,7 @@ infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) { DELETE(INFINI_DEVICE_ALI, nvidia); #endif #ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); + DELETE(INFINI_DEVICE_QY, qy); #endif #ifdef ENABLE_HYGON_API DELETE(INFINI_DEVICE_HYGON, nvidia); diff 
--git a/src/infiniop/ops/gemm/qy/gemm_qy.cu b/src/infiniop/ops/gemm/qy/gemm_qy.cu new file mode 100644 index 000000000..34241a34f --- /dev/null +++ b/src/infiniop/ops/gemm/qy/gemm_qy.cu @@ -0,0 +1,134 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "gemm_qy.cuh" + +namespace op::gemm::qy { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + auto handle = reinterpret_cast(handle_); + auto dtype = c_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MatmulInfo::create(c_desc, a_desc, b_desc, MatrixLayout::COL_MAJOR); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + dtype, result.take(), 0, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *c, + float beta, + const void *a, + const void *b, + float alpha, + void *stream) const { + + cudaDataType a_type, b_type, c_type; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cudaDataType compute_type; +#else + cublasComputeType_t compute_type; +#endif + + switch (_dtype) { + case INFINI_DTYPE_F16: + a_type = b_type = c_type = CUDA_R_16F; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + compute_type = CUDA_R_32F; +#else + compute_type = CUBLAS_COMPUTE_32F; +#endif + break; + case INFINI_DTYPE_BF16: + a_type = b_type = c_type = CUDA_R_16BF; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + compute_type = CUDA_R_32F; +#else + compute_type = CUBLAS_COMPUTE_32F; +#endif + break; + case INFINI_DTYPE_F32: + a_type = b_type = c_type = CUDA_R_32F; +#if defined(ENABLE_ILUVATAR_API) || 
defined(ENABLE_HYGON_API) + compute_type = CUDA_R_32F; +#else + compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; +#endif + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (_info.is_transed) { + std::swap(a, b); + } + + auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; + auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; + + CHECK_STATUS(_opaque->internal->useCublas( + (cudaStream_t)stream, + [&](cublasHandle_t handle) { + // 1. 获取batch数和各矩阵的元素大小(字节) + const int batch = static_cast(_info.batch); + const size_t a_elem_size = 2; // getCudaDataTypeSize(a_type); + const size_t b_elem_size = 2; // getCudaDataTypeSize(b_type); + const size_t c_elem_size = 2; // getCudaDataTypeSize(c_type); + + // 2. 循环处理每个batch + for (int i = 0; i < batch; ++i) { + // 计算当前batch的A/B/C指针(stride是元素步长,转字节步长) + const void *a_batch = (const char *)a + i * _info.a_matrix.stride * a_elem_size; + const void *b_batch = (const char *)b + i * _info.b_matrix.stride * b_elem_size; + void *c_batch = (char *)c + i * _info.c_matrix.stride * c_elem_size; + + // 3. 
调用单batch的cublasGemmEx(参数与原接口完全对齐) + CHECK_CUBLAS( + cublasGemmEx( + handle, + op_a, // 原op_a + op_b, // 原op_b + static_cast(_info.m), // 原m + static_cast(_info.n), // 原n + static_cast(_info.k), // 原k + &alpha, // 原alpha + a_batch, // 当前batch的A指针 + a_type, // 原a_type + static_cast(_info.a_matrix.ld()), // 原a的ld + b_batch, // 当前batch的B指针 + b_type, // 原b_type + static_cast(_info.b_matrix.ld()), // 原b的ld + &beta, // 原beta + c_batch, // 当前batch的C指针 + c_type, // 原c_type + static_cast(_info.c_matrix.ld()), // 原c的ld + compute_type, // 原compute_type + CUBLAS_GEMM_DEFAULT_TENSOR_OP // 原算法选择 + )); + } + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::gemm::qy diff --git a/src/infiniop/ops/gemm/qy/gemm_qy.cuh b/src/infiniop/ops/gemm/qy/gemm_qy.cuh new file mode 100644 index 000000000..0b218bac9 --- /dev/null +++ b/src/infiniop/ops/gemm/qy/gemm_qy.cuh @@ -0,0 +1,8 @@ +#ifndef __GEMM_QY_CUH__ +#define __GEMM_QY_CUH__ + +#include "../gemm.h" + +DESCRIPTOR(qy) + +#endif // __GEMM_QY_CUH__ diff --git a/src/infiniop/ops/gptq_qyblas_gemm/gptq_qyblas_gemm.h b/src/infiniop/ops/gptq_qyblas_gemm/gptq_qyblas_gemm.h new file mode 100644 index 000000000..456e540e1 --- /dev/null +++ b/src/infiniop/ops/gptq_qyblas_gemm/gptq_qyblas_gemm.h @@ -0,0 +1,49 @@ +#ifndef GPTQ_QYBLAS_GEMM_H +#define GPTQ_QYBLAS_GEMM_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::gptq_qyblas_gemm::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + GptqQyblasGemmInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + GptqQyblasGemmInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return 
_workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t a_desc, \ + infiniopTensorDescriptor_t b_desc, \ + infiniopTensorDescriptor_t b_scales_desc, \ + infiniopTensorDescriptor_t b_zeros_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *out, \ + const void *a, const void *b, void *b_scale, void *b_zero, int64_t quant_type, int64_t bit, \ + void *stream) const; \ + }; \ + } + +#endif // GPTQ_QYBLAS_GEMM_H diff --git a/src/infiniop/ops/gptq_qyblas_gemm/info.h b/src/infiniop/ops/gptq_qyblas_gemm/info.h new file mode 100644 index 000000000..24820d53b --- /dev/null +++ b/src/infiniop/ops/gptq_qyblas_gemm/info.h @@ -0,0 +1,117 @@ +#ifndef __GPTQ_QYBLAS_GEMM_INFO_H__ +#define __GPTQ_QYBLAS_GEMM_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include +#include + +inline void prepare_matrix_for_cublas( + infiniopTensorDescriptor_t tensor, + bool &transpose_tensor) { + + auto strides = tensor->strides(); + auto sizes = tensor->shape(); + + if ((strides[0] == 1) && (strides[1] >= std::max(1, sizes[0]))) { + + transpose_tensor = false; + return; + } + if ((strides[1] == 1) && (strides[0] >= std::max(1, sizes[1]))) { + + transpose_tensor = true; + return; + } + transpose_tensor = true; +} + +namespace op::gptq_qyblas_gemm { + +class GptqQyblasGemmInfo { + GptqQyblasGemmInfo() = default; + +public: + infiniDtype_t dtype, weight_dtype, scales_dtype, zeros_dtype, out_dtype; + size_t M, K, N, scales_size_0, scales_size_1; + ptrdiff_t lda, ldb, result_ld; + bool transpose_result; + char transa, transb; + + static utils::Result createGptqQyblasGemmInfo( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t b_scales_desc, + infiniopTensorDescriptor_t b_zeros_desc) { + + auto dtype = a_desc->dtype(); + + 
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + auto out_dtype = out_desc->dtype(); + CHECK_DTYPE(dtype, out_dtype); + + const infiniDtype_t weight_dtype = b_desc->dtype(); + // CHECK_DTYPE(weight_dtype, INFINI_DTYPE_F8, INFINI_DTYPE_U8, INFINI_DTYPE_I8); + + const infiniDtype_t scales_dtype = b_scales_desc->dtype(); + const infiniDtype_t zeros_dtype = b_zeros_desc->dtype(); + + bool transpose_result = false; + bool transpose_mat_1 = false; + bool transpose_mat_2 = false; + + prepare_matrix_for_cublas(out_desc, transpose_result); + + auto mata = (transpose_result ? b_desc : a_desc); + prepare_matrix_for_cublas(transpose_result ? b_desc : a_desc, transpose_mat_1); + auto matb = (transpose_result ? a_desc : b_desc); + prepare_matrix_for_cublas(transpose_result ? a_desc : b_desc, transpose_mat_2); + + auto mat1_sizes = a_desc->shape(); + auto mat2_sizes = b_desc->shape(); + if (transpose_result) { + transpose_mat_1 = !transpose_mat_1; + transpose_mat_2 = !transpose_mat_2; + mat1_sizes = mata->shape(); + mat2_sizes = matb->shape(); + } + + size_t M = mat1_sizes[transpose_result ? 1 : 0]; + size_t K = mat1_sizes[transpose_result ? 0 : 1]; + size_t N = mat2_sizes[transpose_result ? 0 : 1]; + + size_t scales_size_0 = b_scales_desc->shape()[0]; + size_t scales_size_1 = b_scales_desc->shape()[1]; + + auto ndim = out_desc->ndim(); + CHECK_OR_RETURN(ndim == 2 + && a_desc->ndim() == ndim + && b_desc->ndim() == ndim + && b_scales_desc->ndim() == ndim + && b_zeros_desc->ndim() == ndim, + INFINI_STATUS_BAD_TENSOR_SHAPE); + + ptrdiff_t lda = mata->strides()[(transpose_mat_1 == transpose_result) + ? 1 + : 0]; + ptrdiff_t ldb = matb->strides()[(transpose_mat_2 == transpose_result) + ? 1 + : 0]; + ptrdiff_t result_ld = out_desc->strides()[transpose_result ? 0 : 1]; + + char transa = transpose_mat_1 ? 't' : 'n'; + char transb = transpose_mat_2 ? 
't' : 'n'; + + return utils::Result(GptqQyblasGemmInfo{ + dtype, weight_dtype, scales_dtype, zeros_dtype, out_dtype, + M, K, N, scales_size_0, scales_size_1, + lda, ldb, result_ld, + transpose_result, + transa, transb}); + } +}; + +} // namespace op::gptq_qyblas_gemm + +#endif // __GPTQ_QYBLAS_GEMM_INFO_H__ diff --git a/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cu b/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cu new file mode 100644 index 000000000..9aab07aff --- /dev/null +++ b/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cu @@ -0,0 +1,198 @@ +#if defined ENABLE_QY_API +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "dlblas_ext.h" +#include "gptq_qyblas_gemm_nvidia.cuh" + +inline cudaDataType_t ScalarTypeToCudaDataType( + infiniDtype_t scalar_type) { + switch (scalar_type) { + case INFINI_DTYPE_U8: + return CUDA_R_8U; + case INFINI_DTYPE_I8: + return CUDA_R_8I; + case INFINI_DTYPE_I32: + return CUDA_R_32I; + case INFINI_DTYPE_F16: + return CUDA_R_16F; + case INFINI_DTYPE_F32: + return CUDA_R_32F; + case INFINI_DTYPE_F64: + return CUDA_R_64F; + case INFINI_DTYPE_I16: + return CUDA_R_16I; + case INFINI_DTYPE_I64: + return CUDA_R_64I; + case INFINI_DTYPE_BF16: + return CUDA_R_16BF; + case INFINI_DTYPE_F8: + return (cudaDataType_t)CUDA_R_8F_E4M3; + default: + fprintf(stderr, + "Cannot convert ScalarType %d\n", + (int)scalar_type); + abort(); + } +} +namespace op::gptq_qyblas_gemm::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t b_scales_desc, + infiniopTensorDescriptor_t b_zeros_desc) { + + auto info = GptqQyblasGemmInfo::createGptqQyblasGemmInfo(out_desc, a_desc, b_desc, b_scales_desc, 
b_zeros_desc); + + CHECK_RESULT(info); + + size_t workspace_size = 0; + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), workspace_size, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, + size_t workspace_size, + void *out, + const void *a, + const void *b, + void *b_scales, + void *b_zeros, + int64_t quant_type, + int64_t bit, + void *stream) const { + int64_t K = static_cast(_info.K); + + cudaDataType_t computeType_ = (cudaDataType_t)CUDA_R_32F; + cudaDataType_t kernel_Atype_, kernel_Btype_, kernel_Ctype_, kernel_Stype_, kernel_Ztype_; + auto dtype = _info.dtype; + auto weight_dtype = _info.weight_dtype; + if (_info.transpose_result) { + std::swap(a, b); + std::swap(dtype, weight_dtype); + } + kernel_Atype_ = ScalarTypeToCudaDataType(dtype); + kernel_Btype_ = ScalarTypeToCudaDataType(weight_dtype); + + if (quant_type == 0) { + if (8 == bit) { + kernel_Atype_ = (cudaDataType_t)CUDA_R_8U; + } + + if (4 == bit) { + kernel_Atype_ = (cudaDataType_t)CUDA_R_4U; + K = K * 2; + } + } + + kernel_Ctype_ = ScalarTypeToCudaDataType(_info.out_dtype); + kernel_Stype_ = ScalarTypeToCudaDataType(_info.scales_dtype); + kernel_Ztype_ = ScalarTypeToCudaDataType(_info.zeros_dtype); + + float alpha = 1.0f; + float beta = 0.0f; + + int64_t M = static_cast(_info.M); + int64_t N = static_cast(_info.N); + int64_t lda = static_cast(_info.lda); + int64_t ldb = static_cast(_info.ldb); + + int64_t scales_size_0 = static_cast(_info.scales_size_0); + int64_t scales_size_1 = static_cast(_info.scales_size_1); + + int64_t result_ld = static_cast(_info.result_ld); + + dlblasExtQuantParametersV2_t extParameters; + + if (quant_type == 0) { + extParameters.a_group_size_m = M / scales_size_1; + extParameters.a_group_size_k = K / scales_size_0; + extParameters.a_zeropoints_type = kernel_Ztype_; + extParameters.a_zeropoints = b_zeros; + extParameters.a_scales_type = 
kernel_Stype_; + extParameters.a_scales = b_scales; + } else if (quant_type == 1) { + extParameters.a_group_size_m = 1; + extParameters.a_group_size_k = K; + extParameters.a_zeropoints = nullptr; + extParameters.a_scales_type = kernel_Stype_; + extParameters.a_scales = b_scales; + + } else if (quant_type == 2 || quant_type == 3) { + // calculate block_shape according weight/scales shape + int block_shape = 128; + while ((M + block_shape - 1) / block_shape < scales_size_0) { + block_shape /= 2; + if (block_shape < 32) { + fprintf(stderr, + "INTERNAL ASSERT FAILED: block_shape >= 32\n" + "Invalid fp blockwise linear arguments. Weight: [%d, %d]. Scales: [%d, %d].\n", + (int)M, (int)K, (int)scales_size_0, (int)scales_size_1); + abort(); + } + } + if (!((K + block_shape - 1) / block_shape == scales_size_1)) { + fprintf(stderr, + "CHECK FAILED: (K + block_shape - 1) / block_shape == scales_size_1\n"); + abort(); + } + extParameters.a_group_size_m = block_shape; + extParameters.a_group_size_k = block_shape; + extParameters.a_scales_type = kernel_Stype_; + extParameters.a_zeropoints = nullptr; + extParameters.a_scales = b_scales; + } + bool transpose_mat_1 = _info.transa == 't'; + bool transpose_mat_2 = _info.transb == 't'; + cublasOperation_t transa = transpose_mat_1 ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transb = transpose_mat_2 ? 
CUBLAS_OP_T : CUBLAS_OP_N; + + if (_info.dtype == INFINI_DTYPE_F16 || _info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(_opaque->internal->useCublas( + (cudaStream_t)stream, + [&](cublasHandle_t handle) { + CHECK_CUBLAS( + dlblasGemmExV2(handle, + transa, + transb, + M, + N, + K, + &alpha, + a, + kernel_Atype_, + lda, + b, + kernel_Btype_, + ldb, + &beta, + out, + kernel_Ctype_, + result_ld, + computeType_, + CUBLAS_GEMM_DEFAULT_TENSOR_OP, + &extParameters)); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::gptq_qyblas_gemm::nvidia +#endif diff --git a/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cuh b/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cuh new file mode 100644 index 000000000..b489858d9 --- /dev/null +++ b/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __GPTQ_QYBLAS_GEMM_NVIDIA_API_H__ +#define __GPTQ_QYBLAS_GEMM_NVIDIA_API_H__ +#include "../gptq_qyblas_gemm.h" + +DESCRIPTOR(nvidia) + +#endif // __GPTQ_QYBLAS_GEMM_NVIDIA_API_H__ diff --git a/src/infiniop/ops/gptq_qyblas_gemm/operator.cc b/src/infiniop/ops/gptq_qyblas_gemm/operator.cc new file mode 100644 index 000000000..e7bdf7791 --- /dev/null +++ b/src/infiniop/ops/gptq_qyblas_gemm/operator.cc @@ -0,0 +1,103 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gptq_qyblas_gemm.h" + +#if defined ENABLE_QY_API +#include "nvidia/gptq_qyblas_gemm_nvidia.cuh" +#endif + +__INFINI_C infiniStatus_t infiniopCreateGptqQyblasGemmDescriptor( + infiniopHandle_t handle, + infiniopGptqQyblasGemmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t b_scales_desc, + infiniopTensorDescriptor_t b_zeros_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ 
+ return op::gptq_qyblas_gemm::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, a_desc, b_desc, b_scales_desc, b_zeros_desc); + + switch (handle->device) { + +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetGptqQyblasGemmWorkspaceSize( + infiniopGptqQyblasGemmDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__INFINI_C infiniStatus_t infiniopGptqQyblasGemm( + infiniopGptqQyblasGemmDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *a, + const void *b, + void *b_scale, + void *b_zero, + int64_t quant_type, + int64_t bit, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, out, a, b, b_scale, b_zero, quant_type, bit, stream); + + switch (desc->device_type) { + +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyGptqQyblasGemmDescriptor( + infiniopGptqQyblasGemmDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} diff --git a/test/infiniop/gptq_qyblas_gemm.py b/test/infiniop/gptq_qyblas_gemm.py new file mode 100644 index 000000000..44c82ab4e ---
/dev/null +++ b/test/infiniop/gptq_qyblas_gemm.py @@ -0,0 +1,577 @@ +import torch +import numpy +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, +) +from enum import Enum, auto +import itertools +from libinfiniop.scalar_type import scalar_types, ScalarType +from typing import TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Union + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +# Test configurations + +BLOCK_SIZE = [[128, 128]] +M_list = [1, 7]#, 83, 512, 2048] +N_list = [128, 512]#, 1024, 4096, 7748, 13824] +K_list = [256, 4096]#, 5120, 3884, 13824] + +_WEIGHT_DTYPES = [InfiniDtype.I8] + +SEEDS = 0 + +def to_iter(x): + return x if isinstance(x, (list, tuple)) else (x,) + + +_TEST_CASES = list( + itertools.product( + to_iter(M_list), + to_iter(K_list), + to_iter(N_list), + to_iter(BLOCK_SIZE), + to_iter(_WEIGHT_DTYPES), + ) +) + +_TEST_CASES_BIT = [ + # M , K, N, group_size, bit + (128, 128, 128, 128, 4), + (32768, 3584, 4608, 128, 4), + (32768, 3584, 4608, 128, 8), +] + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16] + +_TENSOR_DTYPES_BIT = [InfiniDtype.BF16, InfiniDtype.F16] + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +SUPPORTED_GPTQ_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128] +SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] + + +def native_w8a16_block_int8_matmul( + A, + B, + Bs, + block_size, + output_dtype: torch.float16, +) -> torch.Tensor: + """Matrix multiplication with block-wise 
quantization using native torch.""" + A = A.to(torch.float32) + B = B.to(torch.float32) + assert A.shape[-1] == B.shape[-1] + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + + M = A.numel() // A.shape[-1] + N, K = B.shape + origin_C_shape = A.shape[:-1] + (N, ) + A = A.reshape(M, A.shape[-1]) + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + assert n_tiles == Bs.shape[0] + assert k_tiles == Bs.shape[1] + + C_shape = (M, N) + C = torch.zeros(C_shape, dtype=torch.float32, device=A.device) + + A_tiles = [ + A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) + ] + B_tiles = [[ + B[j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles) + ] for j in range(n_tiles)] + C_tiles = [ + C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) + ] + + for i in range(k_tiles): + for j in range(n_tiles): + a = A_tiles[i] + b = B_tiles[j][i] + c = C_tiles[j] + s = Bs[j][i] + c[:, :] += torch.matmul(a, b.t()) * s + + C = C.reshape(origin_C_shape).to(output_dtype) + return C + + +def _gguf_quantize_weights(w: torch.Tensor, + quant_type: ScalarType, + group_size: Optional[int], + zero_points: bool = False, + ref_zero_points_after_scales: bool = False, + need_weight_ref: bool = True): + assert quant_type.is_integer(), \ + "Floating point quantization may work but has not been tested" + assert not zero_points or group_size is not None, \ + "to have group zero points, group_size must be provided "\ + "(-1 group_size is channelwise)" + + orig_device = w.device + orig_type = w.dtype + size_k, size_n = w.shape + + assert w.is_floating_point(), "w must be float" + + if group_size == -1: + group_size = size_k + + # Reshape to [groupsize, -1] + if group_size is not None and group_size < size_k: + w = w.reshape((-1, group_size, size_n)) + w = w.permute(1, 0, 2) + w = w.reshape((group_size, 
-1)) + + # Compute scale for each group + max_val = torch.max(w, 0, keepdim=True).values + min_val = torch.min(w, 0, keepdim=True).values + + max_q_val = quant_type.max() + min_q_val = quant_type.min() + + w_s = torch.Tensor([1.0]).to(w.device) # unscaled case + maybe_w_zp = None + if group_size is not None: + if zero_points: + assert not quant_type.is_signed() and quant_type.max() > 0 + w_s = (max_val - min_val).clamp(min=1e-5) / quant_type.max() + maybe_w_zp = torch.round(torch.abs(min_val / w_s)) \ + .clamp(min_q_val, max_q_val).int() + else: + # If the bias is such that there are no possible negative/positive + # values, set the max value to inf to avoid divide by 0 + w_s = torch.max( + abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)), + abs(min_val / (min_q_val if min_q_val != 0 else torch.inf))) + + # Quantize + w_q = torch.round(w / w_s).int() + (maybe_w_zp if zero_points else 0) + w_q = torch.clamp(w_q, min_q_val, max_q_val) + + # Compute ref (dequantized) + # For some kernels (namely Machete) the zero-points are applied after the + # scales are applied, for this case computing the reference in similar way + # allows us to use tighter error tolerances in our unit tests. 
+ if need_weight_ref: + if ref_zero_points_after_scales and maybe_w_zp is not None: + w_ref = w_q.to(orig_type) * w_s - maybe_w_zp.to(orig_type) * w_s + else: + w_ref = (w_q - (maybe_w_zp if zero_points else 0)).to(orig_type) * w_s + + if quant_type.has_bias(): + w_q += quant_type.bias + + # Restore original shapes + if group_size is not None and group_size < size_k: + + def reshape_w(w): + w = w.reshape((group_size, -1, size_n)) + w = w.permute(1, 0, 2) + w = w.reshape((size_k, size_n)).contiguous() + return w + + w_q = reshape_w(w_q) + if need_weight_ref: + w_ref = reshape_w(w_ref) + w_s = w_s.reshape((-1, size_n)).contiguous() + + if maybe_w_zp is not None: + maybe_w_zp = maybe_w_zp.reshape((-1, size_n)).contiguous() + maybe_w_zp = maybe_w_zp.to(device=orig_device) + + return ( + w_ref.to(device=orig_device) if need_weight_ref else None, + w_q.to(device=orig_device), + w_s if group_size is not None else None, + maybe_w_zp, + ) + +def gguf_quantize_weights(w: torch.Tensor, + group_size: int, + zero_points: bool = False, + need_weight_ref: bool = False, + bits: int = 4, + ref_zero_points_after_scales: bool = False, + params_dtype: torch.dtype = torch.float16): + size_k, _ = w.shape + + assert w.is_floating_point(), "w must be float" + assert group_size in SUPPORTED_GROUP_SIZES + [ + size_k + ], f"Unsupported groupsize = {group_size}" + + w_ref, w_q, w_s, w_z = _gguf_quantize_weights(w, quant_type=scalar_types.uint4 if bits == 4 else scalar_types.uint8, + group_size=group_size, + zero_points=zero_points, + need_weight_ref=need_weight_ref, + ref_zero_points_after_scales=ref_zero_points_after_scales) + + if zero_points: + w_z = w_z.to(params_dtype) + + w_q = w_q.to(torch.uint8) + + return w_ref, w_q, w_s, w_z + +def gguf_linear_quantize_weights(w: torch.Tensor, + group_size: int, + zero_points: bool = False, + need_weight_ref: bool = False, + bits: int =4, + params_dtype: torch.dtype = torch.float16): + w_ref, w_q, w_s, w_z = gguf_quantize_weights( + w=w, + 
group_size=group_size, + zero_points=zero_points, + need_weight_ref=need_weight_ref, + bits=bits, + ref_zero_points_after_scales=False, + params_dtype=params_dtype, + ) + + if bits == 4: + w_q = (w_q[:,1::2] << 4) | w_q[:, ::2] + w_q = w_q.reshape(w_q.shape[0]//2, -1) # This step is to match the parameters of the dlblasGemmExV2 + + return w_ref, w_q, w_s, w_z + + + +def test( + handle, + device, + M, + K, + N, + block_size, + weight_dtype=InfiniDtype.I8, + dtype=InfiniDtype.BF16, + sync=None, +): + + print( + f"Testing int8 Gptq Qyblas Gemm on {InfiniDeviceNames[device]} with M-K-N:{M, K, N}, block_size:{block_size}, weight dtype:{InfiniDtypeNames[weight_dtype]}, dtype:{InfiniDtypeNames[dtype]}" + ) + quant_type = 3 + bit = 8 + + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + A = TestTensor( + (M, K), + None, + dtype, + device, + ) + if weight_dtype == InfiniDtype.I8: + _info = torch.iinfo(torch.int8) + elif weight_dtype == InfiniDtype.U8: + _info = torch.iinfo(torch.uint8) + elif weight_dtype == InfiniDtype.F8: + _info = torch.iinfo(float8_e4m3fn) + B_orig = TestTensor( + (N, K), + None, + weight_dtype, + device, + randint_low=_info.min, + randint_high=_info.max, + ) + B_torch = B_orig.torch_tensor().t() + B = TestTensor( + (K, N), + B_torch.stride(), + weight_dtype, + device, + mode="manual", + set_tensor=B_torch, + ) + + b_scales = TestTensor( + (n_tiles, k_tiles), + None, + InfiniDtype.F32, + device, + ) + + b_zeros = TestTensor( + (n_tiles, k_tiles), + None, + InfiniDtype.F32, + device, + mode="zeros", + ) + + out = TestTensor( + (M, N), + None, + dtype, + device, + mode="zeros", + ) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGptqQyblasGemmDescriptor( + handle, + ctypes.byref(descriptor), + out.descriptor, + A.descriptor, + B.descriptor, + b_scales.descriptor, + b_zeros.descriptor, + ) + ) + # 
Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + + for tensor in [out, A, B, b_scales, b_zeros]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGptqQyblasGemmWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, A.device) + + def lib_gptq_qyblas_gemm(): + check_error( + LIBINFINIOP.infiniopGptqQyblasGemm( + descriptor, + workspace.data(), + workspace_size.value, + out.data(), + A.data(), + B.data(), + b_scales.data(), + b_zeros.data(), + quant_type, + bit, + None, + ) + ) + + lib_gptq_qyblas_gemm() + + if sync is not None: + sync() + + out_dtype = to_torch_dtype(dtype) + ans = native_w8a16_block_int8_matmul(A.torch_tensor(), B_orig.torch_tensor(), b_scales.torch_tensor(), block_size, out_dtype) + + rel_diff = (torch.mean( + torch.abs(out.actual_tensor().to(torch.float32) - ans.to(torch.float32))) / + torch.mean(torch.abs(ans.to(torch.float32)))) + + assert rel_diff < 0.05 + + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: native_w8a16_block_int8_matmul(A.torch_tensor(), B_orig.torch_tensor(), b_scales.torch_tensor(), block_size, out_dtype), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gptq_qyblas_gemm(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyGptqQyblasGemmDescriptor(descriptor)) + + +def test_bit( + handle, + device, + M, + K, + N, + group_size, + bit, + dtype=InfiniDtype.BF16, + sync=None, +): + + print( + f"Testing Gptq Qyblas Gemm on {InfiniDeviceNames[device]} with M-K-N:{M, K, N}, group_size:{group_size}, bit:{bit}, dtype:{InfiniDtypeNames[dtype]}" + ) + quant_type = 0 + # NOTE(review): removed hard-coded `bit = 4` override so the bit=8 case in _TEST_CASES_BIT is actually exercised + + k_tiles = (K + group_size - 1) // group_size + + A = TestTensor( + (M, K), + None, + dtype, + device, + ) + B_orig = TestTensor( + (K, N), + None, + dtype, + device, + ) + w_ref, w_q, w_s,
w_z = gguf_linear_quantize_weights(B_orig.torch_tensor(), + group_size=group_size, + zero_points=True, + need_weight_ref=True, + bits=bit, + params_dtype=to_torch_dtype(dtype)) + + B = TestTensor( + w_q.shape, + w_q.stride(), + InfiniDtype.U8, + device, + mode="manual", + set_tensor=w_q, + ) + + + b_scales = TestTensor( + w_s.shape, + w_s.stride(), + dtype, + device, + mode="manual", + set_tensor=w_s, + ) + + b_zeros = TestTensor( + w_z.shape, + w_z.stride(), + dtype, + device, + mode="manual", + set_tensor=w_z, + ) + + out = TestTensor( + (M, N), + None, + dtype, + device, + mode="zeros", + ) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGptqQyblasGemmDescriptor( + handle, + ctypes.byref(descriptor), + out.descriptor, + A.descriptor, + B.descriptor, + b_scales.descriptor, + b_zeros.descriptor, + ) + ) + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + + for tensor in [out, A, B, b_scales, b_zeros]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGptqQyblasGemmWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, A.device) + + def lib_gptq_qyblas_gemm(): + check_error( + LIBINFINIOP.infiniopGptqQyblasGemm( + descriptor, + workspace.data(), + workspace_size.value, + out.data(), + A.data(), + B.data(), + b_scales.data(), + b_zeros.data(), + quant_type, + bit, + None, + ) + ) + + lib_gptq_qyblas_gemm() + + if sync is not None: + sync() + + atol, rtol = 2e-2, 2e-2 + if bit == 8: + atol, rtol = 2e-2, 0 + else: + atol, rtol = 2e-2, 2e-2 + ans = torch.matmul(A.torch_tensor(), w_ref.to(A.torch_tensor().device)) + if DEBUG: + debug(out.actual_tensor(), ans, atol=atol, rtol=rtol) + + assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol) + + + # Profiling workflow + if PROFILE: + # fmt: off + 
profile_operation("PyTorch", lambda: torch.matmul(A.torch_tensor(), w_ref.to(A.torch_tensor().device)), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gptq_qyblas_gemm(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyGptqQyblasGemmDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + for device in get_test_devices(args): + test_operator(device, test_bit, _TEST_CASES_BIT, _TENSOR_DTYPES_BIT) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 9a91c931c..26ade4c2a 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -13,6 +13,7 @@ c_double, c_int64, c_bool, + c_int64, ) @@ -1313,6 +1314,45 @@ def per_tensor_dequant_int8_(lib): ] +@OpRegister.operator +def gptq_qyblas_gemm_(lib): + lib.infiniopCreateGptqQyblasGemmDescriptor.restype = c_int32 + lib.infiniopCreateGptqQyblasGemmDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetGptqQyblasGemmWorkspaceSize.restype = c_int32 + lib.infiniopGetGptqQyblasGemmWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGptqQyblasGemm.restype = c_int32 + lib.infiniopGptqQyblasGemm.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_int64, + c_int64, + c_void_p, + ] + + lib.infiniopDestroyGptqQyblasGemmDescriptor.restype = 
c_int32 + lib.infiniopDestroyGptqQyblasGemmDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + @OpRegister.operator def softplus_(lib): lib.infiniopCreateSoftplusDescriptor.restype = c_int32 diff --git a/test/infiniop/libinfiniop/scalar_type.py b/test/infiniop/libinfiniop/scalar_type.py new file mode 100644 index 000000000..af74a3c6e --- /dev/null +++ b/test/infiniop/libinfiniop/scalar_type.py @@ -0,0 +1,356 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import functools +import struct +from dataclasses import dataclass +from enum import Enum + +_SCALAR_TYPES_ID_MAP = {} + + +# Mirrors enum in `core/scalar_type.hpp` +class NanRepr(Enum): + NONE = 0 # nans are not supported + IEEE_754 = 1 # nans are: Exp all 1s, mantissa not all 0s + EXTD_RANGE_MAX_MIN = 2 # nans are: Exp all 1s, mantissa all 1s + + +# This ScalarType class is a parallel implementation of the C++ ScalarType +# class found in csrc/core/scalar_type.hpp. These two classes should be kept +# in sync until the inductor fully supports custom C++ classes. +@dataclass(frozen=True) +class ScalarType: + """ + ScalarType can represent a wide range of floating point and integer + types, in particular it can be used to represent sub-byte data types + (something that torch.dtype currently does not support). It is also + capable of representing types with a bias, i.e.: + `stored_value = value + bias`, + this is useful for quantized types (e.g. standard GPTQ 4bit uses a bias + of 8). The implementation for this class can be found in + csrc/core/scalar_type.hpp, these type signatures should be kept in sync + with that file. + """ + + exponent: int + """ + Number of bits in the exponent if this is a floating point type + (zero if this an integer type) + """ + + mantissa: int + """ + Number of bits in the mantissa if this is a floating point type, + or the number bits representing an integer excluding the sign bit if + this an integer type. 
+ """ + + signed: bool + "If the type is signed (i.e. has a sign bit)" + + bias: int + """ + bias used to encode the values in this scalar type + (value = stored_value - bias, default 0) for example if we store the + type as an unsigned integer with a bias of 128 then the value 0 will be + stored as 128 and -1 will be stored as 127 and 1 will be stored as 129. + """ + + _finite_values_only: bool = False + """ + Private: if infs are supported, used `has_infs()` instead. + """ + + nan_repr: NanRepr = NanRepr.IEEE_754 + """ + How NaNs are represent in this scalar type, returns NanRepr value. + (not applicable for integer types) + """ + + def _floating_point_max_int(self) -> int: + assert self.mantissa <= 52 and self.exponent <= 11, ( + f"Cannot represent max/min as a double for type {self.__str__()}" + ) + + max_mantissa = (1 << self.mantissa) - 1 + if self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN: + max_mantissa = max_mantissa - 1 + + max_exponent = (1 << self.exponent) - 2 + if self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN or self.nan_repr == NanRepr.NONE: + assert self.exponent < 11, ( + f"Cannot represent max/min as a double for type {self.__str__()}" + ) + max_exponent = max_exponent + 1 + + # adjust the exponent to match that of a double + # for now we assume the exponent bias is the standard 2^(e-1) -1, (where + # e is the exponent bits), there is some precedent for non-standard + # biases, example `float8_e4m3b11fnuz` here: + # https://github.com/jax-ml/ml_dtypes but to avoid premature over + # complication we are just assuming the standard exponent bias until + # there is a need to support non-standard biases + exponent_bias = (1 << (self.exponent - 1)) - 1 + exponent_bias_double = (1 << 10) - 1 # double e = 11 + + max_exponent_double = max_exponent - exponent_bias + exponent_bias_double + + # shift the mantissa and exponent into the proper positions for an + # IEEE double and bitwise-or them together. 
+ return (max_mantissa << (52 - self.mantissa)) | (max_exponent_double << 52) + + def _floating_point_max(self) -> float: + double_raw = self._floating_point_max_int() + return struct.unpack("!d", struct.pack("!Q", double_raw))[0] + + def _raw_max(self) -> int | float: + if self.is_floating_point(): + return self._floating_point_max() + else: + assert self.size_bits < 64 or self.size_bits == 64 and self.is_signed(), ( + "Cannot represent max as an int" + ) + return (1 << self.mantissa) - 1 + + def _raw_min(self) -> int | float: + if self.is_floating_point(): + assert self.is_signed(), ( + "We currently assume all floating point types are signed" + ) + sign_bit_double = 1 << 63 + + max_raw = self._floating_point_max_int() + min_raw = max_raw | sign_bit_double + return struct.unpack("!d", struct.pack("!Q", min_raw))[0] + else: + assert not self.is_signed() or self.size_bits <= 64, ( + "Cannot represent min as a int64_t" + ) + + if self.is_signed(): + return -(1 << (self.size_bits - 1)) + else: + return 0 + + @functools.cached_property + def id(self) -> int: + """ + Convert the ScalarType to an int which can be passed to pytorch custom + ops. This layout of the int must be kept in sync with the C++ + ScalarType's from_id method. 
+ """ + val = 0 + offset = 0 + + def or_and_advance(member, bit_width): + nonlocal val + nonlocal offset + bit_mask = (1 << bit_width) - 1 + val = val | (int(member) & bit_mask) << offset + offset = offset + bit_width + + or_and_advance(self.exponent, 8) + or_and_advance(self.mantissa, 8) + or_and_advance(self.signed, 1) + or_and_advance(self.bias, 32) + or_and_advance(self._finite_values_only, 1) + or_and_advance(self.nan_repr.value, 8) + + assert offset <= 64, f"ScalarType fields too big {offset} to fit into an int64" + + _SCALAR_TYPES_ID_MAP[val] = self + + return val + + @property + def size_bits(self) -> int: + return self.exponent + self.mantissa + int(self.signed) + + def min(self) -> int | float: + """ + Min representable value for this scalar type. + (accounting for bias if there is one) + """ + return self._raw_min() - self.bias + + def max(self) -> int | float: + """ + Max representable value for this scalar type. + (accounting for bias if there is one) + """ + return self._raw_max() - self.bias + + def is_signed(self) -> bool: + """ + If the type is signed (i.e. 
has a sign bit), same as `signed` + added for consistency with: + https://pytorch.org/docs/stable/generated/torch.Tensor.is_signed.html + """ + return self.signed + + def is_floating_point(self) -> bool: + "If the type is a floating point type" + return self.exponent != 0 + + def is_integer(self) -> bool: + "If the type is an integer type" + return self.exponent == 0 + + def has_bias(self) -> bool: + "If the type has a non-zero bias" + return self.bias != 0 + + def has_infs(self) -> bool: + "If the type is floating point and supports infinity" + return not self._finite_values_only + + def has_nans(self) -> bool: + return self.nan_repr != NanRepr.NONE + + def is_ieee_754(self) -> bool: + """ + If the type is a floating point type that follows IEEE 754 + conventions + """ + return self.nan_repr == NanRepr.IEEE_754 and not self._finite_values_only + + def __str__(self) -> str: + """ + naming generally follows: https://github.com/jax-ml/ml_dtypes + for floating point types (leading f) the scheme is: + `float_em[flags]` + flags: + - no-flags: means it follows IEEE 754 conventions + - f: means finite values only (no infinities) + - n: means nans are supported (non-standard encoding) + for integer types the scheme is: + `[u]int[b]` + - if bias is not present it means its zero + """ + if self.is_floating_point(): + ret = ( + "float" + + str(self.size_bits) + + "_e" + + str(self.exponent) + + "m" + + str(self.mantissa) + ) + + if not self.is_ieee_754(): + if self._finite_values_only: + ret = ret + "f" + if self.nan_repr != NanRepr.NONE: + ret = ret + "n" + + return ret + else: + ret = ("int" if self.is_signed() else "uint") + str(self.size_bits) + if self.has_bias(): + ret = ret + "b" + str(self.bias) + return ret + + def __repr__(self) -> str: + return "ScalarType." + self.__str__() + + # __len__ needs to be defined (and has to throw TypeError) for pytorch's + # opcheck to work.
+ def __len__(self) -> int: + raise TypeError + + # + # Convenience Constructors + # + + @classmethod + def int_(cls, size_bits: int, bias: int | None) -> "ScalarType": + "Create a signed integer scalar type (size_bits includes sign-bit)." + ret = cls(0, size_bits - 1, True, bias if bias else 0) + ret.id # noqa B018: make sure the id is cached + return ret + + @classmethod + def uint(cls, size_bits: int, bias: int | None) -> "ScalarType": + """Create an unsigned integer scalar type.""" + ret = cls(0, size_bits, False, bias if bias else 0) + ret.id # noqa B018: make sure the id is cached + return ret + + @classmethod + def float_IEEE754(cls, exponent: int, mantissa: int) -> "ScalarType": + """ + Create a standard floating point type + (i.e. follows IEEE 754 conventions). + """ + assert mantissa > 0 and exponent > 0 + ret = cls(exponent, mantissa, True, 0) + ret.id # noqa B018: make sure the id is cached + return ret + + @classmethod + def float_( + cls, exponent: int, mantissa: int, finite_values_only: bool, nan_repr: NanRepr + ) -> "ScalarType": + """ + Create a non-standard floating point type + (i.e. does not follow IEEE 754 conventions). 
+ """ + assert mantissa > 0 and exponent > 0 + assert nan_repr != NanRepr.IEEE_754, ( + "use `float_IEEE754` constructor for floating point types that " + "follow IEEE 754 conventions" + ) + ret = cls(exponent, mantissa, True, 0, finite_values_only, nan_repr) + ret.id # noqa B018: make sure the id is cached + return ret + + @classmethod + def from_id(cls, scalar_type_id: int): + if scalar_type_id not in _SCALAR_TYPES_ID_MAP: + raise ValueError(f"scalar_type_id {scalar_type_id} doesn't exists.") + return _SCALAR_TYPES_ID_MAP[scalar_type_id] + + +# naming generally follows: https://github.com/jax-ml/ml_dtypes +# for floating point types (leading f) the scheme is: +# `float_em[flags]` +# flags: +# - no-flags: means it follows IEEE 754 conventions +# - f: means finite values only (no infinities) +# - n: means nans are supported (non-standard encoding) +# for integer types the scheme is: +# `[u]int[b]` +# - if bias is not present it means its zero + + +class scalar_types: + int4 = ScalarType.int_(4, None) + uint4 = ScalarType.uint(4, None) + int8 = ScalarType.int_(8, None) + uint8 = ScalarType.uint(8, None) + float8_e4m3fn = ScalarType.float_(4, 3, True, NanRepr.EXTD_RANGE_MAX_MIN) + float8_e5m2 = ScalarType.float_IEEE754(5, 2) + float8_e8m0fnu = ScalarType(8, 0, False, 0, True, NanRepr.EXTD_RANGE_MAX_MIN) + float16_e8m7 = ScalarType.float_IEEE754(8, 7) + float16_e5m10 = ScalarType.float_IEEE754(5, 10) + + # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main + # and https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf + float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE) + + float6_e2m3f = ScalarType.float_(2, 3, True, NanRepr.NONE) + + # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf + float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE) + + # "gptq" types + uint2b2 = ScalarType.uint(2, 2) + uint3b4 = ScalarType.uint(3, 4) + uint4b8 = ScalarType.uint(4, 8) + uint8b128 = 
ScalarType.uint(8, 128) + + # colloquial names + bfloat16 = float16_e8m7 + float16 = float16_e5m10 + diff --git a/xmake.lua b/xmake.lua index 8f32bf7cc..c69e0170d 100644 --- a/xmake.lua +++ b/xmake.lua @@ -362,6 +362,9 @@ target("infiniop") add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/nvidia/*.cu.o", {public = true}) add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/*/nvidia/*.cu.o", {public = true}) add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/devices/nvidia/*.cu.o", {public = true}) + add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/qy/*.cu.o", {public = true}) + add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/*/qy/*.cu.o", {public = true}) + add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/devices/qy/*.cu.o", {public = true}) end if has_config("cambricon-mlu") then diff --git a/xmake/qy.lua b/xmake/qy.lua index a2fe269b8..31b65c33c 100644 --- a/xmake/qy.lua +++ b/xmake/qy.lua @@ -142,6 +142,7 @@ target("infiniop-qy") set_languages("cxx17") add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../src/infiniop/ops/*/*/nvidia/*.cu") + add_files("../src/infiniop/devices/qy/*.cu", "../src/infiniop/ops/*/qy/*.cu", "../src/infiniop/ops/*/*/qy/*.cu") if has_config("ninetoothed") then add_files("../build/ninetoothed/*.c", "../build/ninetoothed/*.cpp")