Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ jobs:
- redisearch
- qdrant
- qg_ngt
- qingming
- qsg_ngt
- scann
- sptag
Expand Down
33 changes: 33 additions & 0 deletions ann_benchmarks/algorithms/qingming/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# 使用 AMD 官方镜像
FROM rocm/dev-ubuntu-22.04:6.2

LABEL maintainer="qingming-flat"

# 安装依赖
RUN sudo apt-get update && sudo apt-get install -y python3-pip git

# 安装 Python 库
RUN pip3 install numpy h5py pybind11

# [SABO FIX: 关键路径修正]
# 1. 先创建完整的 Python 包目录结构
WORKDIR /home/app/ann_benchmarks/algorithms/qingming

# 2. 把当前目录的所有文件(cpp, py, yml) 复制进这个深层目录
COPY . .

# 3. 原地编译
# 现在的上下文就在 .../qingming 下,hipcc 能找到文件
RUN hipcc -O3 -shared -fPIC -std=c++17 \
--offload-arch=gfx1100 \
-x hip qingming_hip.cpp \
$(python3 -m pybind11 --includes) \
-o qingming_hip$(python3-config --extension-suffix)

# 4. 回到根目录准备运行
WORKDIR /home/app

# 5. [CRITICAL] 确保 Python 能从 /home/app 找到包
ENV PYTHONPATH=/home/app

ENTRYPOINT ["python3", "-u", "run.py"]
1 change: 1 addition & 0 deletions ann_benchmarks/algorithms/qingming/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

13 changes: 13 additions & 0 deletions ann_benchmarks/algorithms/qingming/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
float:
any:
- base_args: ['@metric']
constructor: QingMing
disabled: false
docker_tag: ann-benchmarks-qingming
# [CORRECTION] 指向包目录,而非文件
module: ann_benchmarks.algorithms.qingming
name: qingming
run_groups:
base:
args:
k: [100]
53 changes: 53 additions & 0 deletions ann_benchmarks/algorithms/qingming/module.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import numpy as np
import ann_benchmarks.algorithms.qingming.qingming_hip as engine
from ann_benchmarks.algorithms.base.module import BaseANN

class QingMing(BaseANN):
def __init__(self, metric, method_param):
self.metric = metric
self.k = method_param.get("k", 100)
# 必须与 config.yml 的 name 一致
self.name = "qingming"
# [Strict Protocol] 初始化结果容器
self.res = []

def fit(self, X):
print(f"[{self.name}] Indexing {X.shape} on AMD GPU...")

if self.metric == 'angular':
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms[norms == 0] = 1.0
X = X / norms

self._data = np.array(X, dtype=np.float32, order='C')
self.engine = engine.QingMingEngine(self._data)

def set_query_arguments(self, k):
self.k = k

def batch_query(self, X, n):
# [Strict Protocol] 只计算,不返回,存入 self.res
if self.metric == 'angular':
norms = np.linalg.norm(X, axis=1, keepdims=True)
norms[norms == 0] = 1.0
X = X / norms

X = np.array(X, dtype=np.float32, order='C')

# C++ 返回的是 numpy array
# 官方框架有时需要 list of list,但 numpy array 通常也兼容
# 为了稳妥,如果你之前的 .so 返回的是 numpy array,这里直接赋值
self.res = self.engine.query(X, n)

# get_batch_results 使用基类的默认实现即可(它就是 return self.res)
# 所以不需要重写它

def query(self, v, n):
# Latency 模式
if self.metric == 'angular':
v = v / np.linalg.norm(v)
v_batch = np.array([v], dtype=np.float32)
return self.engine.query(v_batch, n)[0]

def __str__(self):
return self.name
181 changes: 181 additions & 0 deletions ann_benchmarks/algorithms/qingming/qingming_hip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
/*
* QINGMING-ENGINE: AMD RDNA3 EDITION (Navi 31)
* WRAPPER: PyBind11 for ANN-Benchmarks
*/

#include <hip/hip_runtime.h>
#include <hip/hip_vector_types.h>
#include <vector>
#include <algorithm>
#include <stdexcept>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>

namespace py = pybind11;

#define TOP_K 100
#define WAVE_SIZE 32
#define THREADS_PER_BLOCK 256

#define HIP_CHECK(call) do { \
hipError_t err = call; \
if (err != hipSuccess) { \
throw std::runtime_error(std::string("HIP Error: ") + hipGetErrorString(err)); \
} \
} while (0)

__device__ inline float wave_reduce_sum(float val) {
for (int offset = WAVE_SIZE / 2; offset > 0; offset /= 2) {
val += __shfl_down(val, offset);
}
return val;
}

// ---------------- KERNELS (保留你的原始逻辑) ----------------

template <int DIM>
__global__ __launch_bounds__(256)
void kernel_throughput_float4(const float4* __restrict__ queries, const float4* __restrict__ database, int n_base, int* __restrict__ out_ids, int batch_size) {
int wave_id = threadIdx.x / WAVE_SIZE; int lane_id = threadIdx.x % WAVE_SIZE;
int qid = blockIdx.x * (blockDim.x / WAVE_SIZE) + wave_id;
if (qid >= batch_size) return;

constexpr int ITER = (DIM / 4) / WAVE_SIZE;
float4 my_q[ITER];
const float4* q_ptr = queries + qid * (DIM / 4);

#pragma unroll
for(int i=0; i<ITER; ++i) my_q[i] = q_ptr[lane_id + i*WAVE_SIZE];

float best_dists[TOP_K]; int best_ids[TOP_K];
if (lane_id == 0) for(int k=0; k<TOP_K; ++k) { best_dists[k] = 1e30f; best_ids[k] = -1; }

for (int i = 0; i < n_base; ++i) {
const float4* vec_ptr = database + i * (DIM / 4);
float dist = 0.0f;
#pragma unroll
for(int k=0; k<ITER; ++k) {
float4 v = vec_ptr[lane_id + k*WAVE_SIZE];
dist += (my_q[k].x - v.x)*(my_q[k].x - v.x) + (my_q[k].y - v.y)*(my_q[k].y - v.y) + (my_q[k].z - v.z)*(my_q[k].z - v.z) + (my_q[k].w - v.w)*(my_q[k].w - v.w);
}
dist = wave_reduce_sum(dist);
if (lane_id == 0 && dist < best_dists[TOP_K-1]) {
int pos = TOP_K - 1; while (pos > 0 && best_dists[pos-1] > dist) { best_dists[pos] = best_dists[pos-1]; best_ids[pos] = best_ids[pos-1]; pos--; }
best_dists[pos] = dist; best_ids[pos] = i;
}
}
if (lane_id == 0) for (int k = 0; k < TOP_K; ++k) out_ids[qid * TOP_K + k] = best_ids[k];
}

template <int DIM>
__global__ __launch_bounds__(256)
void kernel_throughput_float(const float* __restrict__ queries, const float* __restrict__ database, int n_base, int* __restrict__ out_ids, int batch_size) {
int wave_id = threadIdx.x / WAVE_SIZE; int lane_id = threadIdx.x % WAVE_SIZE;
int qid = blockIdx.x * (blockDim.x / WAVE_SIZE) + wave_id;
if (qid >= batch_size) return;
constexpr int ITER = DIM / WAVE_SIZE;
float my_q[ITER];
const float* q_ptr = queries + qid * DIM;
#pragma unroll
for(int i=0; i<ITER; ++i) my_q[i] = q_ptr[lane_id + i*WAVE_SIZE];
float best_dists[TOP_K]; int best_ids[TOP_K];
if (lane_id == 0) for(int k=0; k<TOP_K; ++k) { best_dists[k] = 1e30f; best_ids[k] = -1; }
for (int i = 0; i < n_base; ++i) {
const float* vec_ptr = database + i * DIM;
float dist = 0.0f;
#pragma unroll
for(int k=0; k<ITER; ++k) {
float v = vec_ptr[lane_id + k*WAVE_SIZE]; float diff = my_q[k] - v; dist += diff * diff;
}
dist = wave_reduce_sum(dist);
if (lane_id == 0 && dist < best_dists[TOP_K-1]) {
int pos = TOP_K - 1; while (pos > 0 && best_dists[pos-1] > dist) { best_dists[pos] = best_dists[pos-1]; best_ids[pos] = best_ids[pos-1]; pos--; }
best_dists[pos] = dist; best_ids[pos] = i;
}
}
if (lane_id == 0) for (int k = 0; k < TOP_K; ++k) out_ids[qid * TOP_K + k] = best_ids[k];
}

// ---------------- ENGINE WRAPPER ----------------

class QingMingEngine {
void *d_database;
void *d_queries;
int *d_results;
int dimension;
int n_base;
size_t max_q_alloc;

public:
QingMingEngine(py::array_t<float> db_data) {
auto buf = db_data.request();
n_base = buf.shape[0];
dimension = buf.shape[1];

size_t db_size = n_base * dimension * sizeof(float);
HIP_CHECK(hipMalloc(&d_database, db_size));
HIP_CHECK(hipMemcpy(d_database, buf.ptr, db_size, hipMemcpyHostToDevice));

max_q_alloc = 20000;
HIP_CHECK(hipMalloc(&d_queries, max_q_alloc * dimension * sizeof(float)));
HIP_CHECK(hipMalloc(&d_results, max_q_alloc * TOP_K * sizeof(int)));
}

~QingMingEngine() {
hipFree(d_database);
hipFree(d_queries);
hipFree(d_results);
}

py::array_t<int> query(py::array_t<float> q_data, int k) {
auto buf = q_data.request();
int n_q = buf.shape[0];

// 动态扩容
if (n_q > max_q_alloc) {
hipFree(d_queries); hipFree(d_results);
max_q_alloc = n_q * 1.5;
HIP_CHECK(hipMalloc(&d_queries, max_q_alloc * dimension * sizeof(float)));
HIP_CHECK(hipMalloc(&d_results, max_q_alloc * TOP_K * sizeof(int)));
}

HIP_CHECK(hipMemcpy(d_queries, buf.ptr, n_q * dimension * sizeof(float), hipMemcpyHostToDevice));

int threads = 256;
int blocks = (n_q * WAVE_SIZE + threads - 1) / threads;

if (dimension == 128) {
kernel_throughput_float4<128><<<blocks, threads>>>((float4*)d_queries, (float4*)d_database, n_base, d_results, n_q);
} else if (dimension == 960) {
kernel_throughput_float<960><<<blocks, threads>>>((float*)d_queries, (float*)d_database, n_base, d_results, n_q);
} else {
// Fallback for other dims if needed, or throw error
throw std::runtime_error("Unsupported Dimension for AMD Kernel");
}

HIP_CHECK(hipDeviceSynchronize());

std::vector<int> host_res(n_q * TOP_K);
HIP_CHECK(hipMemcpy(host_res.data(), d_results, n_q * TOP_K * sizeof(int), hipMemcpyDeviceToHost));

// 截取用户需要的 Top-K
if (k != TOP_K) {
std::vector<int> final_res(n_q * k);
for(int i=0; i<n_q; ++i) {
for(int j=0; j<k; ++j) {
final_res[i*k + j] = host_res[i*TOP_K + j];
}
}
return py::array_t<int>({n_q, k}, final_res.data());
}

return py::array_t<int>({n_q, TOP_K}, host_res.data());
}
};

PYBIND11_MODULE(qingming_hip, m) {
py::class_<QingMingEngine>(m, "QingMingEngine")
.def(py::init<py::array_t<float>>())
.def("query", &QingMingEngine::query);
}