diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 1d1585d6c..598ddd5ec 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -63,6 +63,7 @@ jobs:
           - redisearch
           - qdrant
           - qg_ngt
+          - qingming
           - qsg_ngt
           - scann
           - sptag
diff --git a/ann_benchmarks/algorithms/qingming/Dockerfile b/ann_benchmarks/algorithms/qingming/Dockerfile
new file mode 100644
index 000000000..c0cf2f370
--- /dev/null
+++ b/ann_benchmarks/algorithms/qingming/Dockerfile
@@ -0,0 +1,33 @@
+# 使用 AMD 官方镜像
+FROM rocm/dev-ubuntu-22.04:6.2
+
+LABEL maintainer="qingming-flat"
+
+# 安装依赖
+RUN sudo apt-get update && sudo apt-get install -y python3-pip git
+
+# 安装 Python 库
+RUN pip3 install numpy h5py pybind11
+
+# [SABO FIX: 关键路径修正]
+# 1. 先创建完整的 Python 包目录结构
+WORKDIR /home/app/ann_benchmarks/algorithms/qingming
+
+# 2. 把当前目录的所有文件(cpp, py, yml) 复制进这个深层目录
+COPY . .
+
+# 3. 原地编译
+# 现在的上下文就在 .../qingming 下，hipcc 能找到文件
+RUN hipcc -O3 -shared -fPIC -std=c++17 \
+    --offload-arch=gfx1100 \
+    -x hip qingming_hip.cpp \
+    $(python3 -m pybind11 --includes) \
+    -o qingming_hip$(python3-config --extension-suffix)
+
+# 4. 回到根目录准备运行
+WORKDIR /home/app
+
+# 5. [CRITICAL] 确保 Python 能从 /home/app 找到包
+ENV PYTHONPATH=/home/app
+
+ENTRYPOINT ["python3", "-u", "run.py"]
diff --git a/ann_benchmarks/algorithms/qingming/__init__.py b/ann_benchmarks/algorithms/qingming/__init__.py
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/ann_benchmarks/algorithms/qingming/__init__.py
@@ -0,0 +1 @@
+
diff --git a/ann_benchmarks/algorithms/qingming/config.yml b/ann_benchmarks/algorithms/qingming/config.yml
new file mode 100644
index 000000000..b03e4894e
--- /dev/null
+++ b/ann_benchmarks/algorithms/qingming/config.yml
@@ -0,0 +1,13 @@
+float:
+  any:
+  - base_args: ['@metric']
+    constructor: QingMing
+    disabled: false
+    docker_tag: ann-benchmarks-qingming
+    # [CORRECTION] 指向包目录，而非文件
+    module: ann_benchmarks.algorithms.qingming
+    name: qingming
+    run_groups:
+      base:
+        args:
+          k: [100]
diff --git a/ann_benchmarks/algorithms/qingming/module.py b/ann_benchmarks/algorithms/qingming/module.py
new file mode 100644
index 000000000..35ea0b10f
--- /dev/null
+++ b/ann_benchmarks/algorithms/qingming/module.py
@@ -0,0 +1,53 @@
+import numpy as np
+import ann_benchmarks.algorithms.qingming.qingming_hip as engine
+from ann_benchmarks.algorithms.base.module import BaseANN
+
+class QingMing(BaseANN):
+    def __init__(self, metric, method_param):
+        self.metric = metric
+        self.k = method_param.get("k", 100)
+        # 必须与 config.yml 的 name 一致
+        self.name = "qingming"
+        # [Strict Protocol] 初始化结果容器
+        self.res = []
+
+    def fit(self, X):
+        print(f"[{self.name}] Indexing {X.shape} on AMD GPU...")
+        
+        if self.metric == 'angular':
+            norms = np.linalg.norm(X, axis=1, keepdims=True)
+            norms[norms == 0] = 1.0
+            X = X / norms
+            
+        self._data = np.array(X, dtype=np.float32, order='C')
+        self.engine = engine.QingMingEngine(self._data)
+
+    def set_query_arguments(self, k):
+        self.k = k
+
+    def batch_query(self, X, n):
+        # [Strict Protocol] 只计算，不返回，存入 self.res
+        if self.metric == 'angular':
+             norms = np.linalg.norm(X, axis=1, keepdims=True)
+             norms[norms == 0] = 1.0
+             X = X / norms
+        
+        X = np.array(X, dtype=np.float32, order='C')
+        
+        # C++ 返回的是 numpy array
+        # 官方框架有时需要 list of list，但 numpy array 通常也兼容
+        # 为了稳妥，如果你之前的 .so 返回的是 numpy array，这里直接赋值
+        self.res = self.engine.query(X, n)
+
+    # get_batch_results 使用基类的默认实现即可（它就是 return self.res）
+    # 所以不需要重写它
+
+    def query(self, v, n):
+        # Latency 模式
+        if self.metric == 'angular':
+            v = v / np.linalg.norm(v)
+        v_batch = np.array([v], dtype=np.float32)
+        return self.engine.query(v_batch, n)[0]
+
+    def __str__(self):
+        return self.name
diff --git a/ann_benchmarks/algorithms/qingming/qingming_hip.cpp b/ann_benchmarks/algorithms/qingming/qingming_hip.cpp
new file mode 100644
index 000000000..3aab3199f
--- /dev/null
+++ b/ann_benchmarks/algorithms/qingming/qingming_hip.cpp
@@ -0,0 +1,181 @@
+/*
+ * QINGMING-ENGINE: AMD RDNA3 EDITION (Navi 31)
+ * WRAPPER: PyBind11 for ANN-Benchmarks
+ */
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_vector_types.h>
+#include <vector>
+#include <algorithm>
+#include <stdexcept>
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
+
+namespace py = pybind11;
+
+#define TOP_K 100
+#define WAVE_SIZE 32
+#define THREADS_PER_BLOCK 256
+
+#define HIP_CHECK(call) do { \
+    hipError_t err = call; \
+    if (err != hipSuccess) { \
+        throw std::runtime_error(std::string("HIP Error: ") + hipGetErrorString(err)); \
+    } \
+} while (0)
+
+__device__ inline float wave_reduce_sum(float val) {
+    for (int offset = WAVE_SIZE / 2; offset > 0; offset /= 2) {
+        val += __shfl_down(val, offset);
+    }
+    return val;
+}
+
+// ---------------- KERNELS (保留你的原始逻辑) ----------------
+
+template <int DIM>
+__global__ __launch_bounds__(256) 
+void kernel_throughput_float4(const float4* __restrict__ queries, const float4* __restrict__ database, int n_base, int* __restrict__ out_ids, int batch_size) {
+    int wave_id = threadIdx.x / WAVE_SIZE; int lane_id = threadIdx.x % WAVE_SIZE;
+    int qid = blockIdx.x * (blockDim.x / WAVE_SIZE) + wave_id;
+    if (qid >= batch_size) return;
+    
+    constexpr int ITER = (DIM / 4) / WAVE_SIZE; 
+    float4 my_q[ITER];
+    const float4* q_ptr = queries + qid * (DIM / 4);
+    
+    #pragma unroll
+    for(int i=0; i<ITER; ++i) my_q[i] = q_ptr[lane_id + i*WAVE_SIZE];
+    
+    float best_dists[TOP_K]; int best_ids[TOP_K];
+    if (lane_id == 0) for(int k=0; k<TOP_K; ++k) { best_dists[k] = 1e30f; best_ids[k] = -1; }
+    
+    for (int i = 0; i < n_base; ++i) {
+        const float4* vec_ptr = database + i * (DIM / 4);
+        float dist = 0.0f;
+        #pragma unroll
+        for(int k=0; k<ITER; ++k) {
+            float4 v = vec_ptr[lane_id + k*WAVE_SIZE];
+            dist += (my_q[k].x - v.x)*(my_q[k].x - v.x) + (my_q[k].y - v.y)*(my_q[k].y - v.y) + (my_q[k].z - v.z)*(my_q[k].z - v.z) + (my_q[k].w - v.w)*(my_q[k].w - v.w);
+        }
+        dist = wave_reduce_sum(dist);
+        if (lane_id == 0 && dist < best_dists[TOP_K-1]) {
+            int pos = TOP_K - 1; while (pos > 0 && best_dists[pos-1] > dist) { best_dists[pos] = best_dists[pos-1]; best_ids[pos] = best_ids[pos-1]; pos--; }
+            best_dists[pos] = dist; best_ids[pos] = i; 
+        }
+    }
+    if (lane_id == 0) for (int k = 0; k < TOP_K; ++k) out_ids[qid * TOP_K + k] = best_ids[k];
+}
+
+template <int DIM>
+__global__ __launch_bounds__(256) 
+void kernel_throughput_float(const float* __restrict__ queries, const float* __restrict__ database, int n_base, int* __restrict__ out_ids, int batch_size) {
+    int wave_id = threadIdx.x / WAVE_SIZE; int lane_id = threadIdx.x % WAVE_SIZE;
+    int qid = blockIdx.x * (blockDim.x / WAVE_SIZE) + wave_id;
+    if (qid >= batch_size) return;
+    constexpr int ITER = DIM / WAVE_SIZE; 
+    float my_q[ITER];
+    const float* q_ptr = queries + qid * DIM;
+    #pragma unroll
+    for(int i=0; i<ITER; ++i) my_q[i] = q_ptr[lane_id + i*WAVE_SIZE];
+    float best_dists[TOP_K]; int best_ids[TOP_K];
+    if (lane_id == 0) for(int k=0; k<TOP_K; ++k) { best_dists[k] = 1e30f; best_ids[k] = -1; }
+    for (int i = 0; i < n_base; ++i) {
+        const float* vec_ptr = database + i * DIM;
+        float dist = 0.0f;
+        #pragma unroll
+        for(int k=0; k<ITER; ++k) {
+            float v = vec_ptr[lane_id + k*WAVE_SIZE]; float diff = my_q[k] - v; dist += diff * diff;
+        }
+        dist = wave_reduce_sum(dist);
+        if (lane_id == 0 && dist < best_dists[TOP_K-1]) {
+            int pos = TOP_K - 1; while (pos > 0 && best_dists[pos-1] > dist) { best_dists[pos] = best_dists[pos-1]; best_ids[pos] = best_ids[pos-1]; pos--; }
+            best_dists[pos] = dist; best_ids[pos] = i; 
+        }
+    }
+    if (lane_id == 0) for (int k = 0; k < TOP_K; ++k) out_ids[qid * TOP_K + k] = best_ids[k];
+}
+
+// ---------------- ENGINE WRAPPER ----------------
+
+class QingMingEngine {
+    void *d_database; 
+    void *d_queries;
+    int *d_results;
+    int dimension;
+    int n_base;
+    size_t max_q_alloc;
+
+public:
+    QingMingEngine(py::array_t<float> db_data) {
+        auto buf = db_data.request();
+        n_base = buf.shape[0];
+        dimension = buf.shape[1];
+        
+        size_t db_size = n_base * dimension * sizeof(float);
+        HIP_CHECK(hipMalloc(&d_database, db_size));
+        HIP_CHECK(hipMemcpy(d_database, buf.ptr, db_size, hipMemcpyHostToDevice));
+
+        max_q_alloc = 20000;
+        HIP_CHECK(hipMalloc(&d_queries, max_q_alloc * dimension * sizeof(float)));
+        HIP_CHECK(hipMalloc(&d_results, max_q_alloc * TOP_K * sizeof(int)));
+    }
+
+    ~QingMingEngine() {
+        hipFree(d_database);
+        hipFree(d_queries);
+        hipFree(d_results);
+    }
+
+    py::array_t<int> query(py::array_t<float> q_data, int k) {
+        auto buf = q_data.request();
+        int n_q = buf.shape[0];
+        
+        // 动态扩容
+        if (n_q > max_q_alloc) {
+            hipFree(d_queries); hipFree(d_results);
+            max_q_alloc = n_q * 1.5;
+            HIP_CHECK(hipMalloc(&d_queries, max_q_alloc * dimension * sizeof(float)));
+            HIP_CHECK(hipMalloc(&d_results, max_q_alloc * TOP_K * sizeof(int)));
+        }
+
+        HIP_CHECK(hipMemcpy(d_queries, buf.ptr, n_q * dimension * sizeof(float), hipMemcpyHostToDevice));
+
+        int threads = 256;
+        int blocks = (n_q * WAVE_SIZE + threads - 1) / threads;
+
+        if (dimension == 128) {
+            kernel_throughput_float4<128><<<blocks, threads>>>((float4*)d_queries, (float4*)d_database, n_base, d_results, n_q);
+        } else if (dimension == 960) {
+            kernel_throughput_float<960><<<blocks, threads>>>((float*)d_queries, (float*)d_database, n_base, d_results, n_q);
+        } else {
+            // Fallback for other dims if needed, or throw error
+             throw std::runtime_error("Unsupported Dimension for AMD Kernel");
+        }
+        
+        HIP_CHECK(hipDeviceSynchronize());
+
+        std::vector<int> host_res(n_q * TOP_K);
+        HIP_CHECK(hipMemcpy(host_res.data(), d_results, n_q * TOP_K * sizeof(int), hipMemcpyDeviceToHost));
+        
+        // 截取用户需要的 Top-K
+        if (k != TOP_K) {
+            std::vector<int> final_res(n_q * k);
+            for(int i=0; i<n_q; ++i) {
+                for(int j=0; j<k; ++j) {
+                    final_res[i*k + j] = host_res[i*TOP_K + j];
+                }
+            }
+            return py::array_t<int>({n_q, k}, final_res.data());
+        }
+
+        return py::array_t<int>({n_q, TOP_K}, host_res.data());
+    }
+};
+
+PYBIND11_MODULE(qingming_hip, m) {
+    py::class_<QingMingEngine>(m, "QingMingEngine")
+        .def(py::init<py::array_t<float>>())
+        .def("query", &QingMingEngine::query);
+}