diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 1d1585d6c..598ddd5ec 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -63,6 +63,7 @@ jobs: - redisearch - qdrant - qg_ngt + - qingming - qsg_ngt - scann - sptag diff --git a/ann_benchmarks/algorithms/qingming/Dockerfile b/ann_benchmarks/algorithms/qingming/Dockerfile new file mode 100644 index 000000000..c0cf2f370 --- /dev/null +++ b/ann_benchmarks/algorithms/qingming/Dockerfile @@ -0,0 +1,33 @@ +# 使用 AMD 官方镜像 +FROM rocm/dev-ubuntu-22.04:6.2 + +LABEL maintainer="qingming-flat" + +# 安装依赖 +RUN sudo apt-get update && sudo apt-get install -y python3-pip git + +# 安装 Python 库 +RUN pip3 install numpy h5py pybind11 + +# [SABO FIX: 关键路径修正] +# 1. 先创建完整的 Python 包目录结构 +WORKDIR /home/app/ann_benchmarks/algorithms/qingming + +# 2. 把当前目录的所有文件(cpp, py, yml) 复制进这个深层目录 +COPY . . + +# 3. 原地编译 +# 现在的上下文就在 .../qingming 下,hipcc 能找到文件 +RUN hipcc -O3 -shared -fPIC -std=c++17 \ + --offload-arch=gfx1100 \ + -x hip qingming_hip.cpp \ + $(python3 -m pybind11 --includes) \ + -o qingming_hip$(python3-config --extension-suffix) + +# 4. 回到根目录准备运行 +WORKDIR /home/app + +# 5. [CRITICAL] 确保 Python 能从 /home/app 找到包 +ENV PYTHONPATH=/home/app + +ENTRYPOINT ["python3", "-u", "run.py"] diff --git a/ann_benchmarks/algorithms/qingming/__init__.py b/ann_benchmarks/algorithms/qingming/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/ann_benchmarks/algorithms/qingming/__init__.py @@ -0,0 +1 @@ + diff --git a/ann_benchmarks/algorithms/qingming/config.yml b/ann_benchmarks/algorithms/qingming/config.yml new file mode 100644 index 000000000..b03e4894e --- /dev/null +++ b/ann_benchmarks/algorithms/qingming/config.yml @@ -0,0 +1,13 @@ +float: + any: + - base_args: ['@metric'] + constructor: QingMing + disabled: false + docker_tag: ann-benchmarks-qingming + # [CORRECTION] 指向包目录,而非文件 + module: ann_benchmarks.algorithms.qingming + name: qingming + run_groups: + base: + args: + k: [100] diff --git a/ann_benchmarks/algorithms/qingming/module.py b/ann_benchmarks/algorithms/qingming/module.py new file mode 100644 index 000000000..35ea0b10f --- /dev/null +++ b/ann_benchmarks/algorithms/qingming/module.py @@ -0,0 +1,53 @@ +import numpy as np +import ann_benchmarks.algorithms.qingming.qingming_hip as engine +from ann_benchmarks.algorithms.base.module import BaseANN + +class QingMing(BaseANN): + def __init__(self, metric, method_param): + self.metric = metric + self.k = method_param.get("k", 100) + # 必须与 config.yml 的 name 一致 + self.name = "qingming" + # [Strict Protocol] 初始化结果容器 + self.res = [] + + def fit(self, X): + print(f"[{self.name}] Indexing {X.shape} on AMD GPU...") + + if self.metric == 'angular': + norms = np.linalg.norm(X, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + X = X / norms + + self._data = np.array(X, dtype=np.float32, order='C') + self.engine = engine.QingMingEngine(self._data) + + def set_query_arguments(self, k): + self.k = k + + def batch_query(self, X, n): + # [Strict Protocol] 只计算,不返回,存入 self.res + if self.metric == 'angular': + norms = np.linalg.norm(X, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + X = X / norms + + X = np.array(X, dtype=np.float32, order='C') + + # C++ 返回的是 numpy array + # 官方框架有时需要 list of list,但 numpy array 通常也兼容 + # 为了稳妥,如果你之前的 .so 返回的是 numpy array,这里直接赋值 + self.res = self.engine.query(X, n) + + # get_batch_results 使用基类的默认实现即可(它就是 return self.res) + # 所以不需要重写它 + + def query(self, v, n): + # Latency 模式 + if self.metric == 'angular': + v = v / np.linalg.norm(v) + v_batch = np.array([v], dtype=np.float32) + return self.engine.query(v_batch, n)[0] + + def __str__(self): + return self.name diff --git a/ann_benchmarks/algorithms/qingming/qingming_hip.cpp b/ann_benchmarks/algorithms/qingming/qingming_hip.cpp new file mode 100644 index 000000000..3aab3199f --- /dev/null +++ b/ann_benchmarks/algorithms/qingming/qingming_hip.cpp @@ -0,0 +1,181 @@ +/* + * QINGMING-ENGINE: AMD RDNA3 EDITION (Navi 31) + * WRAPPER: PyBind11 for ANN-Benchmarks + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; + +#define TOP_K 100 +#define WAVE_SIZE 32 +#define THREADS_PER_BLOCK 256 + +#define HIP_CHECK(call) do { \ + hipError_t err = call; \ + if (err != hipSuccess) { \ + throw std::runtime_error(std::string("HIP Error: ") + hipGetErrorString(err)); \ + } \ +} while (0) + +__device__ inline float wave_reduce_sum(float val) { + for (int offset = WAVE_SIZE / 2; offset > 0; offset /= 2) { + val += __shfl_down(val, offset); + } + return val; +} + +// ---------------- KERNELS (保留你的原始逻辑) ---------------- + +template +__global__ __launch_bounds__(256) +void kernel_throughput_float4(const float4* __restrict__ queries, const float4* __restrict__ database, int n_base, int* __restrict__ out_ids, int batch_size) { + int wave_id = threadIdx.x / WAVE_SIZE; int lane_id = threadIdx.x % WAVE_SIZE; + int qid = blockIdx.x * (blockDim.x / WAVE_SIZE) + wave_id; + if (qid >= batch_size) return; + + constexpr int ITER = (DIM / 4) / WAVE_SIZE; + float4 my_q[ITER]; + const float4* q_ptr = queries + qid * (DIM / 4); + + #pragma unroll + for(int i=0; i 0 && best_dists[pos-1] > dist) { best_dists[pos] = best_dists[pos-1]; best_ids[pos] = best_ids[pos-1]; pos--; } + best_dists[pos] = dist; best_ids[pos] = i; + } + } + if (lane_id == 0) for (int k = 0; k < TOP_K; ++k) out_ids[qid * TOP_K + k] = best_ids[k]; +} + +template +__global__ __launch_bounds__(256) +void kernel_throughput_float(const float* __restrict__ queries, const float* __restrict__ database, int n_base, int* __restrict__ out_ids, int batch_size) { + int wave_id = threadIdx.x / WAVE_SIZE; int lane_id = threadIdx.x % WAVE_SIZE; + int qid = blockIdx.x * (blockDim.x / WAVE_SIZE) + wave_id; + if (qid >= batch_size) return; + constexpr int ITER = DIM / WAVE_SIZE; + float my_q[ITER]; + const float* q_ptr = queries + qid * DIM; + #pragma unroll + for(int i=0; i 0 && best_dists[pos-1] > dist) { best_dists[pos] = best_dists[pos-1]; best_ids[pos] = best_ids[pos-1]; pos--; } + best_dists[pos] = dist; best_ids[pos] = i; + } + } + if (lane_id == 0) for (int k = 0; k < TOP_K; ++k) out_ids[qid * TOP_K + k] = best_ids[k]; +} + +// ---------------- ENGINE WRAPPER ---------------- + +class QingMingEngine { + void *d_database; + void *d_queries; + int *d_results; + int dimension; + int n_base; + size_t max_q_alloc; + +public: + QingMingEngine(py::array_t db_data) { + auto buf = db_data.request(); + n_base = buf.shape[0]; + dimension = buf.shape[1]; + + size_t db_size = n_base * dimension * sizeof(float); + HIP_CHECK(hipMalloc(&d_database, db_size)); + HIP_CHECK(hipMemcpy(d_database, buf.ptr, db_size, hipMemcpyHostToDevice)); + + max_q_alloc = 20000; + HIP_CHECK(hipMalloc(&d_queries, max_q_alloc * dimension * sizeof(float))); + HIP_CHECK(hipMalloc(&d_results, max_q_alloc * TOP_K * sizeof(int))); + } + + ~QingMingEngine() { + hipFree(d_database); + hipFree(d_queries); + hipFree(d_results); + } + + py::array_t query(py::array_t q_data, int k) { + auto buf = q_data.request(); + int n_q = buf.shape[0]; + + // 动态扩容 + if (n_q > max_q_alloc) { + hipFree(d_queries); hipFree(d_results); + max_q_alloc = n_q * 1.5; + HIP_CHECK(hipMalloc(&d_queries, max_q_alloc * dimension * sizeof(float))); + HIP_CHECK(hipMalloc(&d_results, max_q_alloc * TOP_K * sizeof(int))); + } + + HIP_CHECK(hipMemcpy(d_queries, buf.ptr, n_q * dimension * sizeof(float), hipMemcpyHostToDevice)); + + int threads = 256; + int blocks = (n_q * WAVE_SIZE + threads - 1) / threads; + + if (dimension == 128) { + kernel_throughput_float4<128><<>>((float4*)d_queries, (float4*)d_database, n_base, d_results, n_q); + } else if (dimension == 960) { + kernel_throughput_float<960><<>>((float*)d_queries, (float*)d_database, n_base, d_results, n_q); + } else { + // Fallback for other dims if needed, or throw error + throw std::runtime_error("Unsupported Dimension for AMD Kernel"); + } + + HIP_CHECK(hipDeviceSynchronize()); + + std::vector host_res(n_q * TOP_K); + HIP_CHECK(hipMemcpy(host_res.data(), d_results, n_q * TOP_K * sizeof(int), hipMemcpyDeviceToHost)); + + // 截取用户需要的 Top-K + if (k != TOP_K) { + std::vector final_res(n_q * k); + for(int i=0; i({n_q, k}, final_res.data()); + } + + return py::array_t({n_q, TOP_K}, host_res.data()); + } +}; + +PYBIND11_MODULE(qingming_hip, m) { + py::class_(m, "QingMingEngine") + .def(py::init>()) + .def("query", &QingMingEngine::query); +}