Skip to content

Commit c97cd8c

Browse files
committed
[Iluvatar] Support CINN for paddleocr-vl
1 parent acd5638 commit c97cd8c

5 files changed

Lines changed: 55 additions & 42 deletions

File tree

custom_ops/iluvatar_ops/flash_attn_unpadded.cu

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ void FlashAttnUnpaddedKernel(const paddle::Tensor& q,
2424
int num_heads,
2525
int head_dim,
2626
int num_kv_heads,
27-
int max_seqlens_q,
28-
int max_seqlens_k,
27+
const paddle::Tensor& max_seqlens_q_,
28+
const paddle::Tensor& max_seqlens_k_,
2929
bool causal,
3030
float scale,
3131
paddle::Tensor& out) {
@@ -148,10 +148,13 @@ void FlashAttnUnpaddedKernel(const paddle::Tensor& q,
148148
cuinferTensorDescriptor_t lse_desc;
149149
CUINFER_CHECK(cuinferCreateTensorDescriptor(&lse_desc));
150150

151+
const int32_t* max_seqlens_q = max_seqlens_q_.data<int32_t>();
152+
const int32_t* max_seqlens_k = max_seqlens_k_.data<int32_t>();
153+
151154
FmhaFwdFuncArguments args;
152155
args.batch = batch_size;
153-
args.max_seqlen_q = max_seqlens_q;
154-
args.max_seqlen_k = max_seqlens_k;
156+
args.max_seqlen_q = *max_seqlens_q;
157+
args.max_seqlen_k = *max_seqlens_k;
155158
args.is_causal = causal;
156159
args.scaling = scale;
157160
args.window_size_left = -1;
@@ -197,8 +200,8 @@ std::vector<paddle::Tensor> FlashAttnUnpadded(
197200
const paddle::Tensor& v,
198201
const paddle::Tensor& cu_seqlens_q,
199202
const paddle::Tensor& cu_seqlens_k,
200-
int max_seqlens_q,
201-
int max_seqlens_k,
203+
const paddle::Tensor& max_seqlens_q,
204+
const paddle::Tensor& max_seqlens_k,
202205
bool causal,
203206
float scale,
204207
bool training) {
@@ -248,21 +251,31 @@ std::vector<paddle::Tensor> FlashAttnUnpadded(
248251
}
249252

250253
std::vector<std::vector<int64_t>> FlashAttnUnpaddedInferShape(
251-
const std::vector<int64_t>& q_shape) {
254+
const std::vector<int64_t>& q_shape,
255+
const std::vector<int64_t>& k_shape,
256+
const std::vector<int64_t>& v_shape,
257+
const std::vector<int64_t>& cu_seqlens_q_shape,
258+
const std::vector<int64_t>& cu_seqlens_k_shape,
259+
const std::vector<int64_t>& max_seqlens_q_shape,
260+
const std::vector<int64_t>& max_seqlens_k_shape) {
252261
return {{q_shape[0], q_shape[1], q_shape[2]}};
253262
}
254263

255264
std::vector<paddle::DataType> FlashAttnUnpaddedInferDtype(
256-
const paddle::DataType& q_dtype) {
265+
const paddle::DataType& q_dtype,
266+
const paddle::DataType& k_dtype,
267+
const paddle::DataType& v_dtype,
268+
const paddle::DataType& cu_seqlens_q_dtype,
269+
const paddle::DataType& cu_seqlens_v_dtype,
270+
const paddle::DataType& max_seqlens_q_dtype,
271+
const paddle::DataType& max_seqlens_k_dtype) {
257272
return {q_dtype};
258273
}
259274

260275
PD_BUILD_STATIC_OP(cuinfer_flash_attn_unpadded)
261-
.Inputs({"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k"})
276+
.Inputs({"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k", "max_seqlens_q", "max_seqlens_k"})
262277
.Outputs({"out"})
263-
.Attrs({"max_seqlens_q:int",
264-
"max_seqlens_k:int",
265-
"causal:bool",
278+
.Attrs({"causal:bool",
266279
"scale:float",
267280
"training:bool"})
268281
.SetKernelFn(PD_KERNEL(FlashAttnUnpadded))

custom_ops/setup_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -605,7 +605,7 @@ def find_end_files(directory, end_str):
605605
elif paddle.is_compiled_with_xpu():
606606
assert False, "For XPU, please use setup_ops.py in the xpu_ops directory to compile custom ops."
607607
elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
608-
_iluvatar_clang_cuda_flags = ["-Wno-non-pod-varargs", "-DPADDLE_DEV", "-DPADDLE_WITH_CUSTOM_DEVICE"]
608+
_iluvatar_clang_cuda_flags = ["-Wno-non-pod-varargs", "-DPADDLE_DEV", "-DPADDLE_WITH_CUSTOM_DEVICE", "-std=c++17"]
609609
setup(
610610
name="fastdeploy_ops",
611611
ext_modules=CUDAExtension(

docs/get_started/installation/iluvatar_gpu.md

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260
2323
### 3.1 Start Container
2424

2525
```bash
26-
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260507
27-
docker exec -it paddle_infer bash
26+
docker run -itd --name fd_iluvatar -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/workspace:/home/workspace -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --shm-size=64G --net=host --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260507
27+
docker exec -it fd_iluvatar bash
2828
```
2929

3030
Note: The 4.3.8 SDK in the image is incompatible with the KMD, so Paddle cannot find the Iluvatar device. As a temporary workaround, map ixsmi, libcuda.so.1, libixml.so, and libixthunk.so from the host's corex-4.3.8 directory into the container.
3131

32-
/home/paddle contains the model files, *.whl packages, and scripts.
32+
`/home/workspace` (mounted into the container by the `docker run` command above) contains the model files, `*.whl` packages, and scripts.
3333

3434
### 3.2 Install paddle
3535

@@ -478,17 +478,17 @@ export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
478478
export FD_SAMPLING_CLASS=rejection
479479
export CUDA_VISIBLE_DEVICES=1
480480
python3 -m fastdeploy.entrypoints.openai.api_server \
481-
--model /data1/fastdeploy/PaddleOCR-VL \
482-
--port 8180 \
483-
--metrics-port 8471 \
484-
--engine-worker-queue-port 8472 \
485-
--cache-queue-port 55660 \
486-
--max-model-len 16384 \
487-
--max-num-batched-tokens 16384 \
488-
--max-num-seqs 64 \
489-
--workers 2 \
490-
--block-size 16 \
491-
--graph-optimization-config '{"use_cudagraph": true}'
481+
--model /data1/fastdeploy/PaddleOCR-VL \
482+
--port 8180 \
483+
--metrics-port 8471 \
484+
--max-model-len 16384 \
485+
--max-num-batched-tokens 16384 \
486+
--max-num-seqs 240 \
487+
--block-size 16 \
488+
--workers 2 \
489+
--gpu-memory-utilization 0.7 \
490+
--graph-optimization-config '{"graph_opt_level":2, "use_cudagraph": true}'
491+
492492
```
493493

494494
client:

docs/zh/get_started/installation/iluvatar_gpu.md

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260
2323
### 3.1 启动容器
2424

2525
```bash
26-
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260507
27-
docker exec -it paddle_infer bash
26+
docker run -itd --name fd_iluvatar -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/workspace:/home/workspace -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --shm-size=64G --net=host --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260507
27+
docker exec -it fd_iluvatar bash
2828
```
2929

3030
注意: 由于镜像中的 4.3.8 SDK 与 KMD 不兼容,paddle 无法找到 iluvatar device。因此,暂时需要将宿主机 corex-4.3.8 目录中的 ixsmi、libcuda.so.1、libixml.so 和 libixthunk.so 映射到容器中。
3131

32-
/home/paddle 为模型文件、whl包、脚本所在目录。
32+
/home/workspace 为模型文件、whl包、脚本所在目录。
3333

3434
### 3.2 安装paddle
3535

@@ -478,17 +478,17 @@ export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
478478
export FD_SAMPLING_CLASS=rejection
479479
export CUDA_VISIBLE_DEVICES=1
480480
python3 -m fastdeploy.entrypoints.openai.api_server \
481-
--model /data1/fastdeploy/PaddleOCR-VL \
482-
--port 8180 \
483-
--metrics-port 8471 \
484-
--engine-worker-queue-port 8472 \
485-
--cache-queue-port 55660 \
486-
--max-model-len 16384 \
487-
--max-num-batched-tokens 16384 \
488-
--max-num-seqs 64 \
489-
--workers 2 \
490-
--block-size 16 \
491-
--graph-optimization-config '{"use_cudagraph": true}'
481+
--model /data1/fastdeploy/PaddleOCR-VL \
482+
--port 8180 \
483+
--metrics-port 8471 \
484+
--max-model-len 16384 \
485+
--max-num-batched-tokens 16384 \
486+
--max-num-seqs 240 \
487+
--block-size 16 \
488+
--workers 2 \
489+
--gpu-memory-utilization 0.7 \
490+
--graph-optimization-config '{"graph_opt_level":2, "use_cudagraph": true}'
491+
492492
```
493493
494494
客户端:

scripts/run_ci_iluvatar.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
322322
--max-num-seqs 64 \
323323
--workers 2 \
324324
--block-size 16 \
325-
--graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
325+
--graph-optimization-config '{"graph_opt_level":2, "use_cudagraph": true}' > server.log 2>&1 &
326326

327327
check_server_status
328328

0 commit comments

Comments
 (0)