Skip to content

Commit e8dd413

Browse files
committed
[Iluvatar] Support CINN for PaddleOCR-VL by converting max_seqlens to Tensor inputs
1 parent 4ba6625 commit e8dd413

9 files changed

Lines changed: 233 additions & 79 deletions

File tree

custom_ops/iluvatar_ops/flash_attn_unpadded.cu

Lines changed: 33 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ void FlashAttnUnpaddedKernel(const paddle::Tensor& q,
2424
int num_heads,
2525
int head_dim,
2626
int num_kv_heads,
27-
int max_seqlens_q,
28-
int max_seqlens_k,
27+
const paddle::Tensor& max_seqlens_q_,
28+
const paddle::Tensor& max_seqlens_k_,
2929
bool causal,
3030
float scale,
3131
paddle::Tensor& out) {
@@ -148,10 +148,15 @@ void FlashAttnUnpaddedKernel(const paddle::Tensor& q,
148148
cuinferTensorDescriptor_t lse_desc;
149149
CUINFER_CHECK(cuinferCreateTensorDescriptor(&lse_desc));
150150

151+
PD_CHECK(max_seqlens_q_.is_cpu(), "max_seqlens_q tensor must be on CPU");
152+
PD_CHECK(max_seqlens_k_.is_cpu(), "max_seqlens_k tensor must be on CPU");
153+
const int32_t* max_seqlens_q = max_seqlens_q_.data<int32_t>();
154+
const int32_t* max_seqlens_k = max_seqlens_k_.data<int32_t>();
155+
151156
FmhaFwdFuncArguments args;
152157
args.batch = batch_size;
153-
args.max_seqlen_q = max_seqlens_q;
154-
args.max_seqlen_k = max_seqlens_k;
158+
args.max_seqlen_q = *max_seqlens_q;
159+
args.max_seqlen_k = *max_seqlens_k;
155160
args.is_causal = causal;
156161
args.scaling = scale;
157162
args.window_size_left = -1;
@@ -197,8 +202,8 @@ std::vector<paddle::Tensor> FlashAttnUnpadded(
197202
const paddle::Tensor& v,
198203
const paddle::Tensor& cu_seqlens_q,
199204
const paddle::Tensor& cu_seqlens_k,
200-
int max_seqlens_q,
201-
int max_seqlens_k,
205+
const paddle::Tensor& max_seqlens_q,
206+
const paddle::Tensor& max_seqlens_k,
202207
bool causal,
203208
float scale,
204209
bool training) {
@@ -248,23 +253,37 @@ std::vector<paddle::Tensor> FlashAttnUnpadded(
248253
}
249254

250255
std::vector<std::vector<int64_t>> FlashAttnUnpaddedInferShape(
251-
const std::vector<int64_t>& q_shape) {
256+
const std::vector<int64_t>& q_shape,
257+
const std::vector<int64_t>& k_shape,
258+
const std::vector<int64_t>& v_shape,
259+
const std::vector<int64_t>& cu_seqlens_q_shape,
260+
const std::vector<int64_t>& cu_seqlens_k_shape,
261+
const std::vector<int64_t>& max_seqlens_q_shape,
262+
const std::vector<int64_t>& max_seqlens_k_shape) {
252263
return {{q_shape[0], q_shape[1], q_shape[2]}};
253264
}
254265

255266
std::vector<paddle::DataType> FlashAttnUnpaddedInferDtype(
256-
const paddle::DataType& q_dtype) {
267+
const paddle::DataType& q_dtype,
268+
const paddle::DataType& k_dtype,
269+
const paddle::DataType& v_dtype,
270+
const paddle::DataType& cu_seqlens_q_dtype,
271+
const paddle::DataType& cu_seqlens_k_dtype,
272+
const paddle::DataType& max_seqlens_q_dtype,
273+
const paddle::DataType& max_seqlens_k_dtype) {
257274
return {q_dtype};
258275
}
259276

260277
PD_BUILD_STATIC_OP(cuinfer_flash_attn_unpadded)
261-
.Inputs({"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k"})
278+
.Inputs({"q",
279+
"k",
280+
"v",
281+
"cu_seqlens_q",
282+
"cu_seqlens_k",
283+
"max_seqlens_q",
284+
"max_seqlens_k"})
262285
.Outputs({"out"})
263-
.Attrs({"max_seqlens_q:int",
264-
"max_seqlens_k:int",
265-
"causal:bool",
266-
"scale:float",
267-
"training:bool"})
286+
.Attrs({"causal:bool", "scale:float", "training:bool"})
268287
.SetKernelFn(PD_KERNEL(FlashAttnUnpadded))
269288
.SetInferShapeFn(PD_INFER_SHAPE(FlashAttnUnpaddedInferShape))
270289
.SetInferDtypeFn(PD_INFER_DTYPE(FlashAttnUnpaddedInferDtype));

custom_ops/setup_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -606,7 +606,7 @@ def find_end_files(directory, end_str):
606606
elif paddle.is_compiled_with_xpu():
607607
assert False, "For XPU, please use setup_ops.py in the xpu_ops directory to compile custom ops."
608608
elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
609-
_iluvatar_clang_cuda_flags = ["-Wno-non-pod-varargs", "-DPADDLE_DEV", "-DPADDLE_WITH_CUSTOM_DEVICE"]
609+
_iluvatar_clang_cuda_flags = ["-Wno-non-pod-varargs", "-DPADDLE_DEV", "-DPADDLE_WITH_CUSTOM_DEVICE", "-std=c++17"]
610610
setup(
611611
name="fastdeploy_ops",
612612
ext_modules=CUDAExtension(

docs/get_started/installation/iluvatar_gpu.md

Lines changed: 87 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260
2323
### 3.1 Start Container
2424

2525
```bash
26-
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260507
27-
docker exec -it paddle_infer bash
26+
docker run -itd --name fd_iluvatar -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/workspace:/home/workspace -v /usr/local/corex-4.3.8/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex-4.3.8/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex-4.3.8/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex-4.3.8/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --shm-size=64G --net=host --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260507
27+
docker exec -it fd_iluvatar bash
2828
```
2929

3030
Note: Because the 4.3.8 SDK in the image is incompatible with KMD, paddle cannot find the iluvatar device. Therefore, it is temporarily necessary to map ixsmi, libcuda.so.1, libixml.so, and libixthunk.so from the host corex-4.3.8 directory into the container.
3131

32-
/home/paddle contains the model files, *.whl packages, and scripts.
32+
/home/workspace contains the model files, *.whl packages, and scripts.
3333

3434
### 3.2 Install paddle
3535

@@ -458,14 +458,9 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
458458
### 4.3 PaddleOCR-VL series
459459
#### 4.3.1 PaddleOCR-VL-0.9B
460460

461-
- (Optional) Build and install paddleocr from source
462-
463-
To install the latest `paddleocr`, you can compile it from source. The version in the image is `3.3.2`.
464-
461+
- Install paddleocr
465462
```bash
466-
git clone -b main https://github.com/PaddlePaddle/PaddleOCR.git
467-
cd PaddleOCR
468-
pip3 install -e ".[doc-parser]"
463+
pip3 install paddleocr[doc-parser]==3.3.2
469464
```
470465

471466
Refer to the [GPU doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/PaddleOCR-VL-0.9B.md); the command is as below:
@@ -478,17 +473,17 @@ export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
478473
export FD_SAMPLING_CLASS=rejection
479474
export CUDA_VISIBLE_DEVICES=1
480475
python3 -m fastdeploy.entrypoints.openai.api_server \
481-
--model /data1/fastdeploy/PaddleOCR-VL \
482-
--port 8180 \
483-
--metrics-port 8471 \
484-
--engine-worker-queue-port 8472 \
485-
--cache-queue-port 55660 \
486-
--max-model-len 16384 \
487-
--max-num-batched-tokens 16384 \
488-
--max-num-seqs 64 \
489-
--workers 2 \
490-
--block-size 16 \
491-
--graph-optimization-config '{"use_cudagraph": true}'
476+
--model /data1/fastdeploy/PaddleOCR-VL \
477+
--port 8180 \
478+
--metrics-port 8471 \
479+
--max-model-len 16384 \
480+
--max-num-batched-tokens 16384 \
481+
--max-num-seqs 240 \
482+
--block-size 16 \
483+
--workers 2 \
484+
--gpu-memory-utilization 0.7 \
485+
--graph-optimization-config '{"graph_opt_level":2, "use_cudagraph": true}'
486+
492487
```
493488

494489
client:
@@ -508,14 +503,14 @@ The output is:
508503

509504
**benchmark**
510505

511-
1. Download and extract image datasets
506+
1) Download and extract image datasets
512507

513508
```bash
514509
wget https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/internal/tmp/images.tar
515510
tar xvf images.tar
516511
```
517512

518-
2. Prepare `infer_ocr_vl_benchmark.py`
513+
2) Prepare `infer_ocr_vl_benchmark.py`
519514

520515
```python
521516
import os
@@ -532,13 +527,80 @@ for file_name in file_list:
532527
res.save_to_markdown(save_path="output", pretty=False)
533528
```
534529

535-
3. execute `infer_ocr_vl_benchmark.py` on client
530+
3) Execute `infer_ocr_vl_benchmark.py` on the client
536531

537532
```bash
538533
python3 infer_ocr_vl_benchmark.py
539534
```
540535

541-
After each image is inferred, a corresponding `md` file will be generated in the `output` path. Running the entire benchmark (1355 images) takes approximately 1.8 hours.
536+
#### 4.3.2 PaddleOCR-VL-1.6-0.9B
537+
538+
- Install paddleocr
539+
540+
```bash
541+
pip3 install paddleocr[doc-parser]==3.6.0
542+
```
543+
544+
server:
545+
```bash
546+
#!/bin/bash
547+
export PADDLE_XCCL_BACKEND=iluvatar_gpu
548+
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
549+
export FD_SAMPLING_CLASS=rejection
550+
export CUDA_VISIBLE_DEVICES=1
551+
python3 -m fastdeploy.entrypoints.openai.api_server \
552+
--model /data1/fastdeploy/PaddleOCR-VL-1.6 \
553+
--port 8180 \
554+
--metrics-port 8471 \
555+
--max-model-len 16384 \
556+
--max-num-batched-tokens 16384 \
557+
--max-num-seqs 240 \
558+
--block-size 16 \
559+
--workers 2 \
560+
--gpu-memory-utilization 0.7 \
561+
--graph-optimization-config '{"graph_opt_level":2, "use_cudagraph": true}'
562+
563+
```
564+
565+
client:
566+
567+
**simple demo**
568+
569+
```bash
570+
paddleocr doc_parser -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png --vl_rec_backend fastdeploy-server --vl_rec_server_url http://127.0.0.1:8180/v1 --device iluvatar_gpu --pipeline_version v1.6
571+
```
572+
573+
**benchmark**
574+
575+
1) Download and extract image datasets
576+
577+
```bash
578+
wget https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/internal/tmp/images.tar
579+
tar xvf images.tar
580+
```
581+
582+
2) Prepare `infer_ocr_vl_benchmark.py`
583+
584+
```python
585+
import os
586+
from paddleocr import PaddleOCRVL
587+
588+
input_path = "./images"
589+
pipeline = PaddleOCRVL(vl_rec_backend="fastdeploy-server", vl_rec_server_url="http://127.0.0.1:8180/v1", device="iluvatar_gpu", pipeline_version="v1.6")
590+
file_list = os.listdir(input_path)
591+
for file_name in file_list:
592+
file_path = os.path.join(input_path, file_name)
593+
output = pipeline.predict(file_path)
594+
for res in output:
595+
res.print()
596+
res.save_to_markdown(save_path="output", pretty=False)
597+
```
598+
599+
3) Execute `infer_ocr_vl_benchmark.py` on the client
600+
601+
```bash
602+
python3 infer_ocr_vl_benchmark.py
603+
```
542604

543605
## 5. Quantization Format Support
544606
- `W8A16`: `--quantization wint8`

0 commit comments

Comments
 (0)