diff --git a/.gitignore b/.gitignore index b5c4e1a5c00..ccc56106952 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ inference_results/ output/ train_data/ log/ + *.DS_Store *.vs *.user @@ -26,9 +27,14 @@ log/ build/ dist/ *.egg-info/ + /deploy/android_demo/app/OpenCV/ /deploy/android_demo/app/PaddleLite/ /deploy/android_demo/app/.cxx/ /deploy/android_demo/app/cache/ + test_tipc/web/models/ test_tipc/web/node_modules/ + +.venv/ +.worktrees/ diff --git a/deploy/paddleocr_vl_docker/build_pipeline.sh b/deploy/paddleocr_vl_docker/build_pipeline.sh index 6c322e73856..9daf8ec1959 100755 --- a/deploy/paddleocr_vl_docker/build_pipeline.sh +++ b/deploy/paddleocr_vl_docker/build_pipeline.sh @@ -2,6 +2,9 @@ set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/pip_version_arg.sh" + device_type='nvidia-gpu' build_for_offline='false' paddleocr_version='3.4.0' @@ -18,8 +21,8 @@ Usage: $(basename "$0") [OPTIONS] Options: --device-type Device type (nvidia-gpu|nvidia-gpu-sm120|hygon-dcu|kunlunxin-xpu|metax-gpu|iluvatar-gpu|huawei-npu|intel-gpu|amd-gpu) [default: ${device_type}] --offline Build offline version - --ppocr-version PaddleOCR version [default: ${paddleocr_version}] - --pdx-version PaddleX version [default: ${paddlex_version}] + --ppocr-version PaddleOCR version or URL [default: ${paddleocr_version}] + --pdx-version PaddleX version or URL [default: ${paddlex_version}] --platform Build platform [default: ${platform}] --action Post-build action: load|push|tar|none [default: ${action}] load: Load to local Docker @@ -34,6 +37,7 @@ Examples: $0 --device-type nvidia-gpu --action push $0 --device-type nvidia-gpu --platform linux/amd64,linux/arm64 --action push $0 --device-type nvidia-gpu --action tar --platform linux/amd64 + $0 --pdx-version git+https://github.com/PaddlePaddle/PaddleX.git@main EOF } @@ -133,6 +137,9 @@ while [[ $# -gt 0 ]]; do esac done +paddleocr_pip_suffix="$(pip_build_arg_suffix "${paddleocr_version}")" 
+paddlex_pip_suffix="$(pip_build_arg_suffix "${paddlex_version}")" + # Validate platform compatibility for load action if [[ "${action}" == 'load' ]] && [[ "${platform}" == *','* ]]; then echo "Error: Cannot use --action load with multiple platforms" >&2 @@ -155,7 +162,7 @@ if [ ! -f "${dockerfile}" ]; then fi revision="$(git rev-parse --short HEAD)" -image_version="${revision}-ppocr${paddleocr_version}-pdx${paddlex_version}" +image_version="${revision}-ppocr$(version_tag_label "${paddleocr_version}")-pdx$(version_tag_label "${paddlex_version}")" # Image name base_image_name='paddleocr-vl' @@ -163,7 +170,7 @@ base_image_name='paddleocr-vl' # Main tags main_tag="${registry}/${base_image_name}:${tag_suffix}" version_tag="${registry}/${base_image_name}:${tag_suffix/latest/${image_version}}" -paddleocr_version_tag="${registry}/${base_image_name}:${tag_suffix/latest/paddleocr${paddleocr_version%.*}}" +paddleocr_version_tag="${registry}/${base_image_name}:${tag_suffix/latest/paddleocr$(version_tag_label "${paddleocr_version}")}" # Build arguments array build_args=( @@ -173,8 +180,8 @@ build_args=( '-t' "${version_tag}" '-t' "${paddleocr_version_tag}" '--build-arg' "BUILD_FOR_OFFLINE=${build_for_offline}" - '--build-arg' "PADDLEOCR_VERSION===${paddleocr_version}" - '--build-arg' "PADDLEX_VERSION===${paddlex_version}" + '--build-arg' "PADDLEOCR_VERSION=${paddleocr_pip_suffix}" + '--build-arg' "PADDLEX_VERSION=${paddlex_pip_suffix}" '--build-arg' "http_proxy=${http_proxy:-}" '--build-arg' "https_proxy=${https_proxy:-}" '--build-arg' "no_proxy=${no_proxy:-}" diff --git a/deploy/paddleocr_vl_docker/build_vlm.sh b/deploy/paddleocr_vl_docker/build_vlm.sh index d7f9bed1cf7..554ed8857d7 100755 --- a/deploy/paddleocr_vl_docker/build_vlm.sh +++ b/deploy/paddleocr_vl_docker/build_vlm.sh @@ -2,6 +2,9 @@ set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/pip_version_arg.sh" + device_type='nvidia-gpu' backend='vllm' 
build_for_offline='false' @@ -20,8 +23,8 @@ Options: --device-type Device type (nvidia-gpu|nvidia-gpu-sm120|hygon-dcu|kunlunxin-xpu|metax-gpu|iluvatar-gpu|huawei-npu|intel-gpu|amd-gpu) [default: ${device_type}] --backend Backend type (vllm|fastdeploy) [default: ${backend}] --offline Build offline version - --ppocr-version PaddleOCR version [default: ${paddleocr_version}] - --pdx-version PaddleX version [default: ${paddlex_version}] + --ppocr-version PaddleOCR version or URL [default: ${paddleocr_version}] + --pdx-version PaddleX version or URL [default: ${paddlex_version}] --platform Build platform [default: ${platform}] --action Post-build action: load|push|tar|none [default: ${action}] load: Load to local Docker @@ -36,6 +39,7 @@ Examples: $0 --device-type nvidia-gpu --backend vllm --action push $0 --platform linux/amd64,linux/arm64 --action push $0 --action tar --platform linux/amd64 + $0 --ppocr-version https://github.com/PaddlePaddle/PaddleOCR/archive/main.zip EOF } @@ -151,6 +155,9 @@ while [[ $# -gt 0 ]]; do esac done +paddleocr_pip_suffix="$(pip_build_arg_suffix "${paddleocr_version}")" +paddlex_pip_suffix="$(pip_build_arg_suffix "${paddlex_version}")" + # Validate platform compatibility for load action if [[ "${action}" == 'load' ]] && [[ "${platform}" == *','* ]]; then echo "Error: Cannot use --action load with multiple platforms" >&2 @@ -173,7 +180,7 @@ if [ ! 
-f "${dockerfile}" ]; then fi revision="$(git rev-parse --short HEAD)" -image_version="${revision}-ppocr${paddleocr_version}-pdx${paddlex_version}" +image_version="${revision}-ppocr$(version_tag_label "${paddleocr_version}")-pdx$(version_tag_label "${paddlex_version}")" # Image name base_image_name="paddleocr-genai-${backend}-server" @@ -181,7 +188,7 @@ base_image_name="paddleocr-genai-${backend}-server" # Main tags main_tag="${registry}/${base_image_name}:${tag_suffix}" version_tag="${registry}/${base_image_name}:${tag_suffix/latest/${image_version}}" -paddleocr_version_tag="${registry}/${base_image_name}:${tag_suffix/latest/paddleocr${paddleocr_version%.*}}" +paddleocr_version_tag="${registry}/${base_image_name}:${tag_suffix/latest/paddleocr$(version_tag_label "${paddleocr_version}")}" # Build arguments array build_args=( @@ -191,8 +198,8 @@ build_args=( '-t' "${version_tag}" '-t' "${paddleocr_version_tag}" '--build-arg' "BUILD_FOR_OFFLINE=${build_for_offline}" - '--build-arg' "PADDLEOCR_VERSION===${paddleocr_version}" - '--build-arg' "PADDLEX_VERSION===${paddlex_version}" + '--build-arg' "PADDLEOCR_VERSION=${paddleocr_pip_suffix}" + '--build-arg' "PADDLEX_VERSION=${paddlex_pip_suffix}" '--build-arg' "BACKEND=${backend}" '--build-arg' "http_proxy=${http_proxy:-}" '--build-arg' "https_proxy=${https_proxy:-}" diff --git a/deploy/paddleocr_vl_docker/pip_version_arg.sh b/deploy/paddleocr_vl_docker/pip_version_arg.sh new file mode 100644 index 00000000000..6aea7e7bc2b --- /dev/null +++ b/deploy/paddleocr_vl_docker/pip_version_arg.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +pip_build_arg_suffix() { + local v="$1" + [[ -z "$v" ]] && { + echo "" + return + } + if [[ "$v" == '=='* ]]; then + echo "$v" + return + fi + if [[ "$v" == '>='* ]] || [[ "$v" == '<='* ]] || [[ "$v" == '~='* ]] || [[ "$v" == '!='* ]]; then + echo "$v" + return + fi + if [[ "$v" == '<'* ]] || [[ "$v" == '>'* ]]; then + echo "$v" + return + fi + if [[ "$v" =~ ^(https?://|git\+|file://) ]]; then + 
echo " @${v}" + return + fi + if [[ "$v" == @* ]]; then + echo " @${v#@}" + return + fi + echo "==${v}" +} + +version_tag_label() { + local v="$1" + if [[ "$v" =~ ^(https?://|git\+|file://) ]] || [[ "$v" == @* ]]; then + if command -v shasum >/dev/null 2>&1; then + printf 'u%s' "$(printf '%s' "$v" | shasum -a 256 | cut -c1-8)" + elif command -v sha256sum >/dev/null 2>&1; then + printf 'u%s' "$(printf '%s' "$v" | sha256sum | cut -c1-8)" + else + printf 'u%s' "$(printf '%s' "$v" | cksum | cut -c1-8)" + fi + else + local s="$v" + [[ "$s" == '=='* ]] && s="${s#==}" + echo "${s%.*}" + fi +} diff --git a/docs/version3.x/deployment/high_performance_inference.en.md b/docs/version3.x/deployment/high_performance_inference.en.md index baddce26287..7c1f28af7f7 100644 --- a/docs/version3.x/deployment/high_performance_inference.en.md +++ b/docs/version3.x/deployment/high_performance_inference.en.md @@ -16,6 +16,8 @@ This document primarily introduces the installation and usage methods for high-p ### 1.1 Install High-Performance Inference Dependencies +The high-performance inference functionality depends on the PaddlePaddle framework, so before proceeding to the subsequent steps, you need to ensure that PaddlePaddle is installed in your environment. 
+ Install the dependencies required for high-performance inference using the PaddleOCR CLI: ```bash diff --git a/docs/version3.x/deployment/high_performance_inference.md b/docs/version3.x/deployment/high_performance_inference.md index 16166e0842b..5fa471ee985 100644 --- a/docs/version3.x/deployment/high_performance_inference.md +++ b/docs/version3.x/deployment/high_performance_inference.md @@ -16,6 +16,8 @@ comments: true ## 1.1 安装高性能推理依赖 +高性能推理功能依赖于飞桨框架,因此,在进行后续步骤前,您需要确保环境中安装有飞桨框架。 + 通过 PaddleOCR CLI 安装高性能推理所需依赖: ```bash diff --git a/docs/version3.x/inference_engine.en.md b/docs/version3.x/inference_engine.en.md new file mode 100644 index 00000000000..59c9cb42e7b --- /dev/null +++ b/docs/version3.x/inference_engine.en.md @@ -0,0 +1,201 @@ +--- +comments: true +--- + +# Inference Engine and Configuration + +PaddleOCR 3.5 introduces a unified inference-engine configuration mechanism: use `engine` to select the underlying inference engine, and use `engine_config` to pass engine-specific settings. This mechanism applies to both individual models and pipelines. + +If `engine` is not explicitly specified, the default behavior remains the same as in earlier versions: except for a few scenarios such as high-performance inference and generative AI client request features, PaddleOCR uses the PaddlePaddle framework for inference in most cases. If `engine` is explicitly specified, initialization follows the selected engine first. + +## 1. What Is an Inference Engine + +In PaddleOCR, an inference engine refers to the underlying runtime used to execute a model. It determines which runtime loads and runs the model. You can think of it as "the engine actually used during model inference." When using an inference engine, users usually only need to care about two things: + +- which type of inference engine to use; +- how to configure the inference engine. + +## 2. 
Inference Engines Currently Supported by PaddleOCR + +| Engine category | `engine` values | Description | +| - | - | - | +| PaddlePaddle framework | `paddle`, `paddle_static`, `paddle_dynamic` | Runs on the PaddlePaddle framework. | +| Transformers | `transformers` | Runs on Hugging Face Transformers. | + +- `paddle`: The unified entry point for the PaddlePaddle framework. It selects `paddle_static` or `paddle_dynamic` according to the model type and files in the model directory. If both are available, `paddle_static` is preferred. +- `paddle_static`: PaddlePaddle static-graph inference, suitable for scenarios that require better inference performance or more fine-grained performance tuning. +- `paddle_dynamic`: PaddlePaddle dynamic-graph inference, which is more flexible and easier to debug compared to static graph. +- `transformers`: Hugging Face Transformers inference, making it convenient to integrate with the Hugging Face ecosystem. + +## 3. Installation by Inference Engine + +### 3.1 PaddlePaddle framework + +When using the PaddlePaddle framework for inference, you need to install PaddlePaddle first. For installation instructions, see [PaddlePaddle Framework Installation](./paddlepaddle_installation.en.md). + +### 3.2 Transformers + +When using Transformers as the inference engine, you need to install Hugging Face Transformers. Example command: + +```bash +python -m pip install "transformers>=5.4.0" +``` + +In many cases, you also need to install the underlying inference framework. For details, see the [Transformers official documentation](https://huggingface.co/docs/transformers/installation). + +## 4. Configuration and Supported Values of `engine` and `engine_config` + +### 4.1 `engine` + +`engine` is used to specify the inference engine. Supported values are: + +| Value | Meaning | Description | +| - | - | - | +| `None` | No explicit engine specified | Automatically determines the inference engine. 
Keeps the behavior of PaddleOCR 3.4; in most cases, the PaddlePaddle framework will be used for inference. | +| `paddle` | Unified PaddlePaddle framework entry | Automatically selects `paddle_static` or `paddle_dynamic`. | +| `paddle_static` | Static-graph inference | Uses Paddle static-graph inference. | +| `paddle_dynamic` | Dynamic-graph inference | Uses Paddle dynamic-graph inference. | +| `transformers` | Transformers inference | Uses Hugging Face Transformers inference. | + +### 4.2 `engine_config` + +`engine_config` is used to configure the inference engine; we recommend using it together with `engine`. Common `engine_config` fields for each engine are listed below: + +#### `paddle_static` + +Common fields include: + +- `run_mode`: execution mode, such as `paddle`, `trt_fp32`, `trt_fp16`, and `mkldnn`; +- `device_type` / `device_id`: device type and device index; +- `cpu_threads`: number of CPU inference threads; +- `delete_pass`: list of graph optimization passes to disable manually; +- `enable_new_ir`: whether to enable the new IR; +- `enable_cinn`: whether to enable CINN; +- `trt_cfg_setting`: low-level TensorRT configuration; +- `trt_use_dynamic_shapes`: whether to enable TensorRT dynamic shapes; +- `trt_collect_shape_range_info`: whether to collect shape range information; +- `trt_discard_cached_shape_range_info`: whether to discard existing shape range information and recollect it; +- `trt_dynamic_shapes`: dynamic shape configuration; +- `trt_dynamic_shape_input_data`: data used to fill input tensors when collecting dynamic shapes; +- `trt_shape_range_info_path`: path to the shape range information file; +- `trt_allow_rebuild_at_runtime`: whether rebuilding the TensorRT engine is allowed at runtime; +- `mkldnn_cache_capacity`: oneDNN (MKLDNN) cache capacity. + +#### `paddle_dynamic` + +Common fields include: + +- `device_type` / `device_id`: device type and device index used during dynamic-graph execution. 
+ +#### `transformers` + +Common fields include: + +- `dtype`: data type used for model weights / inference, such as `float16`; +- `device_type` / `device_id`: inference device type and device index; +- `trust_remote_code`: whether to trust and execute custom code in model repositories; +- `attn_implementation`: attention implementation method, such as `flash_attention_2`; +- `generation_config`: generation parameters, such as `max_new_tokens` and `temperature`; +- `model_kwargs`: extra arguments passed to the model loading API; +- `processor_kwargs`: extra arguments passed to the processor / image processor loading API; +- `tokenizer_kwargs`: a compatibility-preserved field that is merged with `processor_kwargs`. + +#### 4.2.1 Flat vs. bucketed `engine_config` + +At the same level `engine_config` may be: + +- **Flat**: a dict whose keys are only those required by the **resolved** engine (for example, when using static graph only, top-level keys such as `run_mode` and `cpu_threads`). +- **Bucketed**: top-level keys are **only** registered engine names (e.g. `paddle_static`, `paddle_dynamic`, `transformers`), each mapping to a nested dict. You **must not** mix bucket keys with flat keys at the same level (e.g. `{"paddle_static": {...}, "run_mode": "paddle"}` is invalid). + +When an engine is resolved, only the corresponding config is used: flat configs are validated as a whole; bucketed configs take the entry for that engine. + +### 4.3 Priority and Override Rules + +- For pipelines, `engine` and `engine_config` passed through CLI arguments or Python API initialization arguments take precedence over fields with the same names in the pipeline configuration file. +- In pipeline configuration files, top-level `engine` and `engine_config` act as global settings; `engine` and `engine_config` in submodules or sub-pipelines can override upper-level settings. 
+- For more complete rules about priority, overriding, and pipeline configuration behavior, refer to the PaddleX documentation: [PaddleX Pipeline Python API Usage](https://paddlepaddle.github.io/PaddleX/latest/en/pipeline_usage/instructions/pipeline_python_API.html). + +### 4.4 Compatibility Rules + +- When `engine` is explicitly set, `enable_hpi` no longer takes effect. +- When `engine_config` is explicitly provided, compatibility arguments for the selected engine are ignored. For example, in `paddle` / `paddle_static` scenarios, compatibility arguments such as `use_tensorrt`, `precision`, `enable_mkldnn`, `mkldnn_cache_capacity`, `cpu_threads`, and `enable_cinn` no longer take effect. + +## 5. Usage Examples + +### 5.1 Individual model (CLI): select the engine with `--engine` + +```bash +paddleocr text_detection -i general_ocr_001.png --engine transformers +``` + +### 5.2 Individual model (Python): explicitly specify `transformers` + +```python +from paddleocr import TextDetection + +model = TextDetection( + model_name="PP-OCRv5_server_det", + engine="transformers", +) + +result = model.predict("general_ocr_001.png") +``` + +### 5.3 Individual model (Python): specify `paddle_static` and `engine_config` + +```python +from paddleocr import TextDetection + +model = TextDetection( + model_name="PP-OCRv5_server_det", + engine="paddle_static", + engine_config={ + "device_type": "cpu", + "cpu_threads": 4, + "run_mode": "mkldnn", + }, +) + +result = model.predict("general_ocr_001.png") +``` + +### 5.4 Pipeline (CLI): select the engine with `--engine` + +```bash +paddleocr ocr -i general_ocr_001.png --engine paddle_static +``` + +### 5.5 Pipeline (Python API): configure the inference engine for a specific module + +If you want to specify `engine` and `engine_config` for a specific module inside a pipeline, you can first export the configuration file, modify the corresponding module configuration, and then load it. 
For how to export, edit, and load the configuration file, see [Using PaddleX Pipeline Configuration Files](./paddleocr_and_paddlex.en.md#3-using-paddlex-pipeline-configuration-files). Example: + +First, export the pipeline configuration file: + +```python +from paddleocr import PaddleOCR + +pipeline = PaddleOCR() +pipeline.export_paddlex_config_to_yaml("ocr_config.yaml") +``` + +Then, set `engine` and `engine_config` specifically for the `TextDetection` module in `ocr_config.yaml`: + +```yaml +pipeline_name: OCR +SubModules: + TextDetection: + engine: paddle_static + engine_config: + device_type: cpu + cpu_threads: 4 + run_mode: mkldnn +``` + +Use the updated configuration file for inference: + +```python +from paddleocr import PaddleOCR + +pipeline = PaddleOCR(paddlex_config="ocr_config.yaml") +result = pipeline.predict("general_ocr_001.png") +``` diff --git a/docs/version3.x/inference_engine.md b/docs/version3.x/inference_engine.md new file mode 100644 index 00000000000..81d077cc974 --- /dev/null +++ b/docs/version3.x/inference_engine.md @@ -0,0 +1,201 @@ +--- +comments: true +--- + +# 推理引擎与配置说明 + +PaddleOCR 3.5 引入了统一的推理引擎配置方式:使用 `engine` 选择底层推理引擎,使用 `engine_config` 传递该引擎的专属配置。无论是单模型还是产线,均可按这套方式声明推理行为。 + +如果不显式指定 `engine`,默认行为与旧版本保持一致:除高性能推理功能和生成式 AI 客户端请求功能等少数场景外,大多数情况下会优先使用飞桨框架推理;如果显式指定 `engine`,则会优先按指定引擎初始化。 + +## 1. 什么是推理引擎 + +在 PaddleOCR 中,推理引擎指模型执行时所使用的底层运行时。它决定了模型由哪套运行时加载与执行,可以将它理解为“模型推理时实际使用的引擎”。在使用推理引擎时,用户通常只需要关心两件事: + +- 选择哪类推理引擎; +- 如何配置推理引擎。 + +## 2. PaddleOCR 当前支持的推理引擎 + +| 引擎类别 | `engine` 取值 | 说明 | +| - | - | - | +| 飞桨框架 | `paddle`、`paddle_static`、`paddle_dynamic` | 基于飞桨框架运行。 | +| Transformers | `transformers` | 基于 Hugging Face Transformers 运行。 | + +- `paddle`:飞桨框架统一入口。根据模型类型和模型目录中的文件选择 `paddle_static` 或 `paddle_dynamic`,在二者都可用的情况下偏好 `paddle_static`。 +- `paddle_static`:飞桨静态图推理,适合对推理性能有一定要求或者需要进行精细化推理性能调优的场景。 +- `paddle_dynamic`:飞桨动态图推理,相比静态图更加灵活、易于调试。 +- `transformers`:Hugging Face Transformers 推理,便于与 Hugging Face 生态集成。 + +## 3. 
各推理引擎安装方式 + +### 3.1 飞桨框架 + +当您使用飞桨框架进行推理时,需要先安装飞桨框架。安装方法请参考[飞桨框架安装](./paddlepaddle_installation.md)。 + +### 3.2 Transformers + +当您使用 Transformers 作为推理引擎时,需要安装 Hugging Face Transformers。示例命令如下: + +```bash +python -m pip install "transformers>=5.4.0" +``` + +通常,您还需要安装底层推理框架,详情可参考 [Transformers 官方文档](https://huggingface.co/docs/transformers/installation)。 + +## 4. `engine` 和 `engine_config` 的设置与取值 + +### 4.1 `engine` + +`engine` 用于指定推理引擎,可取值如下: + +| 取值 | 含义 | 说明 | +| - | - | - | +| `None` | 不显式指定引擎 | 自动确定推理引擎。保持 PaddleOCR 3.4 的行为,大多数情况下会使用飞桨框架推理。 | +| `paddle` | 飞桨框架统一入口 | 自动选择 `paddle_static` 或 `paddle_dynamic`。 | +| `paddle_static` | 静态图推理 | 使用飞桨静态图推理。 | +| `paddle_dynamic` | 飞桨动态图推理 | 使用飞桨动态图推理。 | +| `transformers` | Transformers 推理 | 使用 Hugging Face Transformers 推理。 | + +### 4.2 `engine_config` + +`engine_config` 用于配置推理引擎,建议与 `engine` 搭配使用。各引擎常见的 `engine_config` 字段如下: + +#### `paddle_static` + +常见字段包括: + +- `run_mode`:运行模式,如 `paddle`、`trt_fp32`、`trt_fp16`、`mkldnn`; +- `device_type` / `device_id`:设备类型和设备编号; +- `cpu_threads`:CPU 推理线程数; +- `delete_pass`:手动禁用的图优化 pass 列表; +- `enable_new_ir`:是否启用新 IR; +- `enable_cinn`:是否启用 CINN; +- `trt_cfg_setting`:TensorRT 底层配置; +- `trt_use_dynamic_shapes`:是否启用 TensorRT 动态形状; +- `trt_collect_shape_range_info`:是否采集 shape range 信息; +- `trt_discard_cached_shape_range_info`:是否丢弃已有 shape range 信息并重新采集; +- `trt_dynamic_shapes`:动态形状配置; +- `trt_dynamic_shape_input_data`:采集动态形状时用于填充输入张量的数据; +- `trt_shape_range_info_path`:shape range 信息文件路径; +- `trt_allow_rebuild_at_runtime`:运行时是否允许重建 TensorRT 引擎; +- `mkldnn_cache_capacity`:oneDNN(MKLDNN)缓存容量。 + +#### `paddle_dynamic` + +常见字段包括: + +- `device_type` / `device_id`:动态图执行时的设备类型和设备编号。 + +#### `transformers` + +常见字段包括: + +- `dtype`:模型权重 / 推理使用的数据类型,如 `float16`; +- `device_type` / `device_id`:推理设备类型和设备编号; +- `trust_remote_code`:是否信任并执行模型仓库中的自定义代码; +- `attn_implementation`:注意力实现方式,如 `flash_attention_2`; +- `generation_config`:生成参数,如 `max_new_tokens`、`temperature`; +- `model_kwargs`:传给模型加载接口的额外参数; +- 
`processor_kwargs`:传给 processor / image processor 加载接口的额外参数; +- `tokenizer_kwargs`:兼容保留字段,会与 `processor_kwargs` 合并使用。 + +#### 4.2.1 扁平与分桶 `engine_config` + +同一层级的 `engine_config` 可以是: + +- **扁平**:只包含**当前解析得到的引擎**所需的字段(例如仅使用静态图时,顶层直接是 `run_mode`、`cpu_threads` 等)。 +- **分桶**:顶层键**仅**为 PaddleX 已注册的引擎名(如 `paddle_static`、`paddle_dynamic`、`transformers` 等),每个键对应一个嵌套字典。**不得**在同一层级混用「分桶键」与扁平字段(例如 `{"paddle_static": {...}, "run_mode": "paddle"}` 会报错)。 + +解析为某一引擎时,只会使用与该引擎对应的一份配置:扁平形式直接参与校验;分桶形式则取出对应键下的字典。 + +### 4.3 优先级与覆盖规则 + +- 对于产线,命令行参数或 Python API 初始化参数中传入的 `engine`、`engine_config`,优先级高于产线配置文件中的同名字段; +- 在产线配置文件中,顶层 `engine`、`engine_config` 会作为全局配置,子模块或子产线中的 `engine`、`engine_config` 可覆盖上层配置; +- 关于产线配置文件中更完整的优先级、覆盖和调用规则,建议参考 PaddleX 文档:[PaddleX 产线 Python 脚本使用说明](https://paddlepaddle.github.io/PaddleX/latest/pipeline_usage/instructions/pipeline_python_API.html)。 + +### 4.4 兼容性规则 + +- 显式设置 `engine` 后,`enable_hpi` 不再生效; +- 显式传入 `engine_config` 后,与该引擎对应的兼容参数会被忽略。例如在 `paddle` / `paddle_static` 场景下,`use_tensorrt`、`precision`、`enable_mkldnn`、`mkldnn_cache_capacity`、`cpu_threads`、`enable_cinn` 等兼容参数不再生效。 + +## 5. 
调用示例 + +### 5.1 单模型(CLI):通过 `--engine` 选择引擎 + +```bash +paddleocr text_detection -i general_ocr_001.png --engine transformers +``` + +### 5.2 单模型(Python):显式指定 `transformers` + +```python +from paddleocr import TextDetection + +model = TextDetection( + model_name="PP-OCRv5_server_det", + engine="transformers", +) + +result = model.predict("general_ocr_001.png") +``` + +### 5.3 单模型(Python):指定 `paddle_static` 与 `engine_config` + +```python +from paddleocr import TextDetection + +model = TextDetection( + model_name="PP-OCRv5_server_det", + engine="paddle_static", + engine_config={ + "device_type": "cpu", + "cpu_threads": 4, + "run_mode": "mkldnn", + }, +) + +result = model.predict("general_ocr_001.png") +``` + +### 5.4 产线(CLI):通过 `--engine` 选择引擎 + +```bash +paddleocr ocr -i general_ocr_001.png --engine paddle_static +``` + +### 5.5 产线(Python API):为某个模块单独配置推理引擎 + +如需为产线中的某一个模块单独指定 `engine`、`engine_config`,可先导出配置文件,修改对应模块配置后,再加载该配置文件。配置文件的导出、编辑与加载方式可参见 [使用 PaddleX 产线配置文件](./paddleocr_and_paddlex.md#3-paddlex)。示例如下: + +首先,导出产线配置文件: + +```python +from paddleocr import PaddleOCR + +pipeline = PaddleOCR() +pipeline.export_paddlex_config_to_yaml("ocr_config.yaml") +``` + +然后,在 `ocr_config.yaml` 中为 `TextDetection` 模块单独设置 `engine`、`engine_config`: + +```yaml +pipeline_name: OCR +SubModules: + TextDetection: + engine: paddle_static + engine_config: + device_type: cpu + cpu_threads: 4 + run_mode: mkldnn +``` + +使用更新后的配置文件完成推理: + +```python +from paddleocr import PaddleOCR + +pipeline = PaddleOCR(paddlex_config="ocr_config.yaml") +result = pipeline.predict("general_ocr_001.png") +``` diff --git a/docs/version3.x/installation.en.md b/docs/version3.x/installation.en.md index a157ab37f34..16ddbee85ce 100644 --- a/docs/version3.x/installation.en.md +++ b/docs/version3.x/installation.en.md @@ -4,149 +4,73 @@ comments: true # Installation +## 1. Install the inference engine, PaddleOCR Python package, and optional feature dependencies -# 1. 
Install PaddlePaddle Framework +This section explains how to install the inference engine as needed, the `paddleocr` distribution package, and optional dependency groups by capability domain. This path covers running pretrained pipelines for inference locally, as well as auxiliary features such as document format conversion. **Model training and model export** are covered in Section 2 and are independent of the installation path above. -When installing PaddlePaddle, you can choose to install it via Docker or pip. +### 1.1 Install the inference engine (as needed) -## 1.1 Installing PaddlePaddle via Docker -If you choose to install via Docker, please refer to the following commands to use the official Docker image of the PaddlePaddle framework to create a container named `paddleocr` and map the current working directory to the `/paddle` directory inside the container: +PaddleOCR 3.5 uses a unified inference-engine configuration and can use backends such as PaddlePaddle and Transformers. To actually run model inference, install your chosen inference engine by following [Inference Engine and Configuration](./inference_engine.en.md). 
-If your Docker version >= 19.03, please use: - -```bash -# For CPU users: -docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0 /bin/bash - -# For GPU users: -# gpu,requires GPU driver version ≥450.80.02 (Linux) or ≥452.39 (Windows) -docker run --gpus all --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash - -# gpu,requires GPU driver version ≥550.54.14 (Linux) or ≥550.54.14 (Windows) -docker run --gpus all --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash -``` - -* If your Docker version <= 19.03 and >= 17.06, please use: - -
Click Here - -
# For CPU users:
-docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0 /bin/bash
-
-# For GPU users:
-# CUDA 11.8 users
-nvidia-docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
-
-# CUDA 12.3 users
-nvidia-docker run --name paddleocr -v $PWD:/paddle  --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
-
- -* If your Docker version <= 17.06, please update your Docker. +### 1.2 Install paddleocr - -* Note: For more official PaddlePaddle Docker images, please refer to the [PaddlePaddle official website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/en/install/docker/linux-docker.html) - -## 1.2 Installing PaddlePaddle via pip -If you choose to install via pip, please refer to the following commands to install PaddlePaddle in your current environment using pip: +Install the latest `paddleocr` from PyPI: ```bash -# CPU -python -m pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ - -# gpu,requires GPU driver version ≥450.80.02 (Linux) or ≥452.39 (Windows) - python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ - -# gpu,requires GPU driver version ≥550.54.14 (Linux) or ≥550.54.14 (Windows) - python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ -``` - - -Note: For more PaddlePaddle Wheel versions, please refer to the [PaddlePaddle official website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/en/install/pip/linux-pip.html). - -After installation, you can verify if PaddlePaddle is successfully installed using the following command: - -```bash -python -c "import paddle; print(paddle.__version__)" -``` -If the installation is successful, the following content will be output: - -```bash -3.2.0 -``` - -## 1.3 Installation of PaddlePaddle Wheel Package for Windows with NVIDIA 50 Series GPUs - -The standard installation of PaddlePaddle does not fully support NVIDIA 50 series GPUs on Windows operating systems. Therefore, we provide a specially adapted PaddlePaddle package for this hardware environment. Please select the corresponding wheel file according to your Python version for installation. 
- -```bash -# python 3.9 -python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp39-cp39-win_amd64.whl - -# python 3.10 -python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp310-cp310-win_amd64.whl - -# python 3.11 -python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp311-cp311-win_amd64.whl - -# python 3.12 -python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp312-cp312-win_amd64.whl -``` -**Note:** The currently released PaddlePaddle wheel package for Windows systems with 50 series GPUs has known issues with text recognition model training, and related functionalities are still being adapted and improved. - - -# 2. Install PaddleOCR - -If you only want to use the inference capabilities of PaddleOCR, please refer to [Install Inference Package](#21-install-inference-package); if you want to perform model training, exporting, etc., please refer to [Install Training Dependencies](#22-install-training-dependencies). It is allowed to install both the inference package and training dependencies in the same environment without the need for environment isolation. 
- -## 2.1 Install Inference Package - -Install the latest version of the PaddleOCR inference package from PyPI: - -```bash -# If you only want to use the basic text recognition feature (returning text position coordinates and content) +# Default capabilities only: general OCR and document image preprocessing python -m pip install paddleocr -# If you want to use all functionalities, such as document parsing, document understanding, document translation, and key information extraction +# All optional capabilities: document parsing, document understanding, +# document translation, key information extraction, etc. # python -m pip install "paddleocr[all]" ``` -Or install from source (default is the development branch): +Or install from source (tracks the repository’s current default branch by default): ```bash -# If you only want to use the basic text recognition feature (returning text position coordinates and content) +# Default capabilities only: general OCR and document image preprocessing python -m pip install "paddleocr@git+https://github.com/PaddlePaddle/PaddleOCR.git" -# If you want to use all functionalities, such as document parsing, document understanding, document translation, and key information extraction +# All optional capabilities: document parsing, document understanding, +# document translation, key information extraction, etc. # python -m pip install "paddleocr[all]@git+https://github.com/PaddlePaddle/PaddleOCR.git" ``` -In addition to the `all` dependency group demonstrated above, PaddleOCR also supports installing specific optional features by specifying other dependency groups. The available dependency groups provided by PaddleOCR are as follows: +### 1.3 Choose dependency groups by capability + +Besides `all`, you can enable selected optional capabilities by specifying dependency groups. Each group corresponds to a capability domain (document parsing, information extraction, document translation, etc.). 
The available groups are: | Dependency Group Name | Corresponding Functionality | | - | - | -| `doc-parser` | Document parsing: can be used to extract layout elements such as tables, formulas, stamps, images, etc. from documents; includes models like PP-StructureV3 | -| `ie` | Information extraction: can be used to extract key information from documents, such as names, dates, addresses, amounts, etc.; includes models like PP-ChatOCRv4 | -| `trans` | Document translation: can be used to translate documents from one language to another; includes models like PP-DocTranslation | -| `all` | Complete functionality | +| `doc-parser` | Document parsing. Extracts layout elements such as tables, formulas, seals, and images from documents. Includes model solutions such as PP-StructureV3 | +| `ie` | Information extraction. Extracts key information such as names, dates, addresses, and amounts from documents. Includes model solutions such as PP-ChatOCRv4 | +| `trans` | Document translation. Translates documents from one language to another. Includes model solutions such as PP-DocTranslation | +| `doc2md` | Document-to-Markdown conversion. Quickly turns Word, Excel, and PowerPoint files into readable text | +| `all` | Full functionality | + +The general OCR pipeline and the document image preprocessing pipeline require no extra dependency groups; document parsing, information extraction, document translation, and other capabilities follow the table above. See each pipeline’s documentation for its dependency group. For individual modules, install any dependency group that contains the module to use its basic functionality. + +## 2. Install training and export dependencies -The general OCR pipeline (e.g., PP-OCRv3/v4/v5) and the document image preprocessing pipeline can be used without installing any additional dependency groups. Apart from these two pipelines, each remaining pipeline belongs to one and only one dependency group. 
You can refer to the usage documentation of each pipeline to determine which group it belongs to. For individual functional modules, installing any dependency group that includes the module will enable access to its core functionality. +To train models or export models, install the training-related dependencies separately. This installation path is independent of the `paddleocr` package and the optional dependency groups described in Section 1; both can coexist in the same environment, and no environment isolation is required. -## 2.2 Install Training Dependencies +Training and export depend on the PaddlePaddle framework. Complete PaddlePaddle installation first by following [PaddlePaddle Framework Installation](./paddlepaddle_installation.en.md). -To perform model training, exporting, etc., first clone the repository to your local machine: +Clone this repository locally, then install the remaining dependencies: ```bash # Recommended method git clone https://github.com/PaddlePaddle/PaddleOCR # (Optional) Switch to a specific branch -git checkout release/3.2 +git checkout release/3.5 -# If you encounter network issues preventing successful cloning, you can also use the repository on Gitee: +# If cloning fails because of network issues, you can also use the Gitee repository: git clone https://gitee.com/paddlepaddle/PaddleOCR -# Note: The code hosted on Gitee may not be synchronized in real-time with updates from this GitHub project, with a delay of 3~5 days. Please prioritize using the recommended method. +# Note: The code hosted on Gitee may lag behind the GitHub repository by 3 to 5 days. +# Please prioritize the recommended method. 
``` -Run the following command to install the dependencies: +Run the following command to install the remaining training dependencies: ```bash python -m pip install -r requirements.txt diff --git a/docs/version3.x/installation.md b/docs/version3.x/installation.md index 6bfdc78e286..21262a3e36b 100644 --- a/docs/version3.x/installation.md +++ b/docs/version3.x/installation.md @@ -4,141 +4,62 @@ comments: true # 安装 -# 1. 安装飞桨框架 +## 1. 安装推理引擎、PaddleOCR Python 库及可选功能依赖 -安装飞桨 PaddlePaddle 时,支持通过 Docker 安装和通过 pip 安装。 +本节说明如何按需安装推理引擎、`paddleocr` 分发包,以及按能力域启用的可选依赖组。这条路径适用于在本地调用预训练产线完成推理,或是使用文档格式转换等辅助功能。**模型训练与模型导出**见第 2 节,与上述安装路径相互独立。 -## 1.1 基于 Docker 安装飞桨 +### 1.1 安装推理引擎(按需) -若您通过 Docker 安装,请参考下述命令,使用飞桨框架官方 Docker 镜像,创建一个名为 `paddleocr` 的容器,并将当前工作目录映射到容器内的 `/paddle` 目录: +PaddleOCR 3.5 采用统一推理引擎配置,底层可对接飞桨、Transformers 等。若要实际执行模型推理,请参考 [推理引擎与配置说明](./inference_engine.md) 安装所选推理引擎。 -若您使用的 Docker 版本 >= 19.03,请执行: +### 1.2 安装 paddleocr -```bash -# 对于 cpu 用户: -docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0 /bin/bash - -# 对于 gpu 用户: -# GPU 版本,需显卡驱动程序版本 ≥450.80.02(Linux)或 ≥452.39(Windows) -docker run --gpus all --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash - -# GPU 版本,需显卡驱动程序版本 ≥545.23.06(Linux)或 ≥545.84(Windows) -docker run --gpus all --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash -``` - -* 若您使用的 Docker 版本 <= 19.03 但 >= 17.06,请执行: - -
点击展开 - -
# 对于 cpu 用户:
-docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0 /bin/bash
-
-# 对于 gpu 用户:
-# CUDA11.8 用户
-nvidia-docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
-
-# CUDA12.3 用户
-nvidia-docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
-
- -* 若您使用的 Docker 版本 <= 17.06,请升级 Docker 版本。 - -* 注:更多飞桨官方 docker 镜像请参考[飞桨官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html)。 - -## 1.2 基于 pip 安装飞桨 - -若您通过 pip 安装,请参考下述命令,用 pip 在当前环境中安装飞桨 PaddlePaddle: +从 PyPI 安装最新版本的 `paddleocr`: ```bash -# CPU 版本 -python -m pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ - -# GPU 版本,需显卡驱动程序版本 ≥450.80.02(Linux)或 ≥452.39(Windows) -python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ - -# GPU 版本,需显卡驱动程序版本 ≥550.54.14(Linux)或 ≥550.54.14(Windows) - python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ -``` - -> ❗ :无需关注物理机上的 CUDA 版本,只需关注显卡驱动程序版本。更多飞桨 Wheel 版本请参考[飞桨官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)。 - -安装完成后,使用以下命令可以验证 PaddlePaddle 是否安装成功: - -```bash -python -c "import paddle; print(paddle.__version__)" -``` - -如果已安装成功,将输出以下内容: - -```bash -3.2.0 -``` - -## 1.3 Windows 系统适配 NVIDIA 50 系显卡的 PaddlePaddle wheel 包安装 - -通过以上方式安装的 PaddlePaddle 在 Windows 操作系统下无法正常支持 NVIDIA 50 系显卡。因此,我们提供了专门适配该硬件环境的 PaddlePaddle 安装包。请根据您的 Python 版本选择对应的 wheel 文件进行安装。 - -```bash -# python 3.9 -python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp39-cp39-win_amd64.whl - -# python 3.10 -python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp310-cp310-win_amd64.whl - -# python 3.11 -python -m pip install 
https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp311-cp311-win_amd64.whl - -# python 3.12 -python -m https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp312-cp312-win_amd64.whl -``` - -**注:** 当前发布的适用于 Windows 系统 50 系显卡的 PaddlePaddle wheel 包,其文本识别模型的训练存在已知问题,相关功能仍在持续适配和完善中。 - -# 2. 安装 PaddleOCR - -如果只希望使用 PaddleOCR 的推理功能,请参考 [安装推理包](#21);如果希望进行模型训练、导出等,请参考 [安装训练依赖](#22)。在同一环境中安装推理包和训练依赖是允许的,无需进行环境隔离。 - -## 2.1 安装推理包 - -从 PyPI 安装最新版本 PaddleOCR 推理包: - -```bash -# 只希望使用基础文字识别功能(返回文字位置坐标和文本内容) +# 仅需要通用 OCR 与文档图像预处理等默认能力 python -m pip install paddleocr -# 希望使用文档解析、文档理解、文档翻译、关键信息抽取等全部功能 +# 需要文档解析、文档理解、文档翻译、关键信息抽取等全部可选能力 # python -m pip install "paddleocr[all]" ``` -或者从源码安装(默认为开发分支): +或从源码安装(默认跟踪仓库当前默认分支): ```bash -# 只希望使用基础文字识别功能(返回文字位置坐标和文本内容) +# 仅需要通用 OCR 与文档图像预处理等默认能力 python -m pip install "paddleocr@git+https://github.com/PaddlePaddle/PaddleOCR.git" -# 希望使用文档解析、文档理解、文档翻译、关键信息抽取等全部功能 +# 需要文档解析、文档理解、文档翻译、关键信息抽取等全部可选能力 # python -m pip install "paddleocr[all]@git+https://github.com/PaddlePaddle/PaddleOCR.git" ``` -除了上面演示的 `all` 依赖组以外,PaddleOCR 也支持通过指定其它依赖组,安装部分可选功能。PaddleOCR 提供的所有依赖组如下: +### 1.3 按功能选择依赖组 + +除 `all` 外,可通过指定依赖组启用部分可选能力。依赖组表示能力域(文档解析、信息抽取、文档翻译等)。各依赖组如下: | 依赖组名称 | 对应的功能 | | - | - | | `doc-parser` | 文档解析,可用于提取文档中的表格、公式、印章、图片等版面元素,包含 PP-StructureV3 等模型方案 | | `ie` | 信息抽取,可用于从文档中提取关键信息,如姓名、日期、地址、金额等,包含 PP-ChatOCRv4 等模型方案 | | `trans` | 文档翻译,可用于将文档从一种语言翻译为另一种语言,包含 PP-DocTranslation 等模型方案 | +| `doc2md` | 文档转 MarkDown,可用于将 Word、Excel、PowerPoint 文件快速转为可读文本 | | `all` | 完整功能 | -通用 OCR 产线(如 PP-OCRv3/v4/v5)、文档图像预处理产线的功能无需安装额外的依赖组即可使用。除了这两条产线外,每一条产线属于且仅属于一个依赖组。在各产线的使用文档中可以了解产线属于哪一依赖组。对于单功能模块,安装任意包含该模块的产线对应的依赖组后即可使用相关的基础功能。 +通用 OCR 
产线与文档图像预处理产线无需额外依赖组;文档解析、信息抽取、文档翻译等按上表安装对应组。各产线所属依赖组见对应产线文档;单功能模块在任一包含该模块的依赖组安装后即可调用其基础能力。 + +## 2. 安装训练与导出依赖 + +若要进行模型训练或模型导出,需另行安装训练相关依赖。该路径与第 1 节的 `paddleocr` 包及可选依赖组属于不同安装维度;同一环境中可同时存在,无需强制隔离。 -## 2.2 安装训练依赖 +训练与导出依赖飞桨框架,请先参考[飞桨框架安装](./paddlepaddle_installation.md)完成 PaddlePaddle 安装。 -要进行模型训练、导出等,需要首先将仓库克隆到本地: +将本仓库克隆到本地后安装其余依赖: ```bash # 推荐方式 git clone https://github.com/PaddlePaddle/PaddleOCR # (可选)切换到指定分支 -git checkout release/3.2 +git checkout release/3.5 # 如果因为网络问题无法克隆成功,也可选择使用码云上的仓库: git clone https://gitee.com/paddlepaddle/PaddleOCR @@ -146,7 +67,7 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR # 注:码云托管代码可能无法实时同步本 GitHub 项目更新,存在3~5天延时,请优先使用推荐方式。 ``` -执行如下命令安装依赖: +执行如下命令安装其余训练依赖: ```bash python -m pip install -r requirements.txt diff --git a/docs/version3.x/logging.en.md b/docs/version3.x/logging.en.md index d8b46a43500..5cdd9f9564a 100644 --- a/docs/version3.x/logging.en.md +++ b/docs/version3.x/logging.en.md @@ -4,7 +4,7 @@ comments: true # Logging -This document mainly introduces how to configure the logging system for the PaddleOCR inference package. It's important to note that PaddleOCR's inference package uses a different logging system than the training scripts, and this document does not cover the configuration of the logging system used in the training scripts. +This document explains how to configure logging for the `paddleocr` Python package. The `paddleocr` package uses a different logging system than training scripts, and this document does not cover training-script logging configuration. PaddleOCR has built a centralized logging system based on Python's [`logging` standard library](https://docs.python.org/3/library/logging.html#module-logging). In other words, PaddleOCR uses a single logger, which can be accessed and configured via `paddleocr.logger`. 
diff --git a/docs/version3.x/logging.md b/docs/version3.x/logging.md index cf1d890ce02..1dd5a292100 100644 --- a/docs/version3.x/logging.md +++ b/docs/version3.x/logging.md @@ -4,7 +4,7 @@ comments: true # 日志 -本文档主要介绍如何配置 PaddleOCR 推理包的日志系统。需要注意的是,PaddleOCR 推理包与训练脚本使用的是不同的日志系统,本文档不涉及训练脚本所使用的日志系统的配置方法。 +本文档主要介绍如何配置通过 PyPI 安装的 `paddleocr` 包的日志系统。需要注意的是,`paddleocr` 与训练脚本使用的是不同的日志系统,本文档不涉及训练脚本所使用的日志系统的配置方法。 PaddleOCR 构建了一个基于 Python [`logging` 标准库](https://docs.python.org/zh-cn/3/library/logging.html#module-logging) 的集中式日志系统。换言之,PaddleOCR 使用唯一的日志记录器(logger),可通过 `paddleocr.logger` 访问和配置。 diff --git a/docs/version3.x/module_usage/chart_parsing.en.md b/docs/version3.x/module_usage/chart_parsing.en.md index 9e19eec0700..e27f11797ab 100644 --- a/docs/version3.x/module_usage/chart_parsing.en.md +++ b/docs/version3.x/module_usage/chart_parsing.en.md @@ -40,7 +40,19 @@ Run the following command to get started instantly: ```bash paddleocr chart_parsing -i "{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png'}" -```` +``` + +The example above uses the paddle_dynamic inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following command: + +```bash +# Use the transformers engine for inference +paddleocr chart_parsing -i "{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png'}" \ + --engine transformers +``` + +In most scenarios, the default `paddle_dynamic` inference engine delivers better inference performance and is the recommended first choice. **Note:** By default, PaddleOCR retrieves models from HuggingFace. 
If HuggingFace access is restricted in your environment, you can switch the model source to BOS by setting the environment variable: `PADDLE_PDX_MODEL_SOURCE="BOS"`. Support for more mainstream sources is planned. @@ -58,6 +70,27 @@ for res in results: res.save_to_json(f"./output/res.json") ``` +The example above uses the paddle_dynamic inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following code: + +```python +from paddleocr import ChartParsing +model = ChartParsing( + model_name="PP-Chart2Table", + engine="transformers", +) +results = model.predict( + input={"image": "chart_parsing_02.png"}, + batch_size=1 +) +for res in results: + res.print() + res.save_to_json(f"./output/res.json") +``` + +In most scenarios, the default `paddle_dynamic` inference engine delivers better inference performance and is the recommended first choice. + The output result will be: ```bash @@ -120,6 +153,18 @@ Defaults to GPU 0 if available; otherwise falls back to CPU. str | None None + +engine +Meaning: Inference engine.
Description: Supports None (the default), paddle, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_dynamic engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + @@ -232,4 +277,57 @@ Any positive integer. Currently, this module supports inference only and does not yet support fine-tuning. Fine-tuning capabilities are planned for future releases. -## 5. FAQ +## 5. Inference Engine + +For detailed descriptions, values, compatibility rules, and examples of the inference engine, please refer to Inference Engine and Configuration Description. + +### 5.1 Speed Data + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
PP-Chart2Tablepaddle_dynamic53.0017863.950.3017917.78
transformers23.9512217.370.4712269.98
+ +Test Environment Description: +
    +
  • Test Data: [Sample Image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.jpg)
  • +
  • Hardware Configuration: +
      +
    • GPU: NVIDIA A100 40G
    • +
    • CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
    • +
    +
  • +
  • Software Environment: +
      +
    • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
    • +
    • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
    • +
    +
  • +
+ +## 6. FAQ diff --git a/docs/version3.x/module_usage/chart_parsing.md b/docs/version3.x/module_usage/chart_parsing.md index bb5cc4954b6..e9860644188 100644 --- a/docs/version3.x/module_usage/chart_parsing.md +++ b/docs/version3.x/module_usage/chart_parsing.md @@ -42,6 +42,18 @@ comments: true paddleocr chart_parsing -i "{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png'}" ``` +上述示例默认使用 paddle_dynamic 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下命令: + +```bash +# 使用 transformers 引擎进行推理 +paddleocr chart_parsing -i "{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png'}" \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_dynamic` 推理引擎通常具备更好的推理性能,建议优先使用。 + 注:PaddleOCR 官方模型默认从 HuggingFace 获取,如运行环境访问 HuggingFace 不便,可通过环境变量修改模型源为 BOS:`PADDLE_PDX_MODEL_SOURCE="BOS"`,未来将支持更多主流模型源; 您也可以将开放文档类视觉语言模型模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/chart_parsing_02.png)到本地。 @@ -58,6 +70,27 @@ for res in results: res.save_to_json(f"./output/res.json") ``` +上述示例默认使用 paddle_dynamic 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下代码: + +```python +from paddleocr import ChartParsing +model = ChartParsing( + model_name="PP-Chart2Table", + engine="transformers", +) +results = model.predict( + input={"image": "chart_parsing_02.png"}, + batch_size=1 +) +for res in results: + res.print() + res.save_to_json(f"./output/res.json") +``` + +在大多数场景下,默认的 `paddle_dynamic` 推理引擎通常具备更好的推理性能,建议优先使用。 + 运行后,得到的结果为: ```bash @@ -118,6 +151,18 @@ for res in results: str|None None + +engine +含义:推理引擎。
说明:支持 None(默认值)、paddlepaddle_dynamictransformers。保持为默认值 None 时,本地推理默认使用 paddle_dynamic 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 +str|None +None + + +engine_config +含义:推理引擎配置。
说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。 +dict|None +None + @@ -228,4 +273,57 @@ for res in results: 当前模块暂时不支持微调训练,仅支持推理集成。关于该模块的微调训练,计划在未来支持。 -## 五、FAQ +## 五、推理引擎 + +关于推理引擎的详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 + +### 5.1 速度数据 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
PP-Chart2Tablepaddle_dynamic53.0017863.950.3017917.78
transformers23.9512217.370.4712269.98
+ +测试环境说明: +
    +
  • 测试数据:[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.jpg)
  • +
  • 硬件配置: +
      +
    • GPU:NVIDIA A100 40G
    • +
    • CPU:Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
    • +
    +
  • +
  • 软件环境: +
      +
    • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
    • +
    • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
    • +
    +
  • +
+ +## 六、FAQ diff --git a/docs/version3.x/module_usage/doc_img_orientation_classification.en.md b/docs/version3.x/module_usage/doc_img_orientation_classification.en.md index 7125e0b7bcd..116f7ca5382 100644 --- a/docs/version3.x/module_usage/doc_img_orientation_classification.en.md +++ b/docs/version3.x/module_usage/doc_img_orientation_classification.en.md @@ -10,7 +10,7 @@ The Document Image Orientation Classification Module is primarily designed to di ## 2. Supported Models List -> The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. The "Normal Mode" values correspond to the local paddle_static inference engine. @@ -51,7 +51,7 @@ The Document Image Orientation Classification Module is primarily designed to di
  • Software Environment:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -94,6 +94,18 @@ You can quickly experience it with one command: paddleocr doc_img_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/img_rot180_demo.jpg ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following command: + +```bash +# Use the transformers engine for inference +paddleocr doc_img_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/img_rot180_demo.jpg \ + --engine transformers +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + Note: The official models are downloaded from HuggingFace by default. If you cannot access HuggingFace, please set the environment variable `PADDLE_PDX_MODEL_SOURCE="BOS"` to change the model source to BOS. In the future, more model sources will be supported. You can also integrate the model inference of the Document Image Orientation Classification Module into your project. Before running the following code, please download the [sample image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/img_rot180_demo.jpg) to your local machine. @@ -109,6 +121,28 @@ for res in output: res.save_to_json("./output/res.json") ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). 
+ +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following code: + +```python +from paddleocr import DocImgOrientationClassification + +model = DocImgOrientationClassification( + model_name="PP-LCNet_x1_0_doc_ori", + engine="transformers", +) +output = model.predict("img_rot180_demo.jpg", batch_size=1) +for res in output: + res.print(json_format=False) + res.save_to_img("./output/demo.png") + res.save_to_json("./output/res.json") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + +If you want to use the trained model with the `paddle_dynamic` or `transformers` engine, refer to the [Weight Conversion](#52-weight-conversion) section in the [Inference Engine](#5-inference-engine) section below to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + After running, the result will be: ```bash @@ -166,6 +200,18 @@ By default, GPU 0 is used if available; otherwise, CPU is used. + + + + + + + + + + + + @@ -336,8 +382,74 @@ Positive integer.
    None
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    engine_configMeaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration.
    dict|NoneNone
    enable_hpi Meaning:Whether to enable high-performance inference. bool
    -## IV. Secondary Development +## 4. Secondary Development Since PaddleOCR does not directly provide training functionality for document image orientation classification, if you need to train a document image orientation classification model, you can refer to the [PaddleX Secondary Development for Document Image Orientation Classification](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/doc_img_orientation_classification.html#iv-custom-development) section for training guidance. The trained model can be seamlessly integrated into PaddleOCR's API for inference purposes. -## V. FAQ +If you want to use the `paddle_dynamic` or `transformers` engine with the trained model, please refer to the [Weight Conversion](#52-weight-conversion) section in [Inference Engine](#5-inference-engine) later in this document to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + +## 5. Inference Engine + +For detailed descriptions, values, compatibility rules, and examples of the inference engine, please refer to Inference Engine and Configuration Description. + +### 5.1 Speed Data + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    PP-LCNet_x1_0_doc_oripaddle_static2.213.360.065.74
    paddle_dynamic2.157.540.079.87
    transformers4.463.440.148.36
    + +Test Environment Description: +
      +
    • Test Data: [Sample Image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/img_rot180_demo.jpg)
    • +
    • Hardware Configuration: +
        +
      • GPU: NVIDIA A100 40G
      • +
      • CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • Software Environment: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 Weight Conversion + +When using the inference engine, the system will automatically download the official pre-trained model. If you need to use a self-trained model with the `paddle_dynamic` or `transformers` engine, please refer to the [PaddleX Document Image Orientation Classification Module Weight Conversion](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/doc_img_orientation_classification.html#442) section to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. This allows seamless integration into the PaddleOCR API for inference. + +## 6. FAQ diff --git a/docs/version3.x/module_usage/doc_img_orientation_classification.md b/docs/version3.x/module_usage/doc_img_orientation_classification.md index 143ad6fad45..cb8d8680938 100644 --- a/docs/version3.x/module_usage/doc_img_orientation_classification.md +++ b/docs/version3.x/module_usage/doc_img_orientation_classification.md @@ -10,7 +10,7 @@ comments: true ## 二、支持模型列表 -> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。 @@ -51,7 +51,7 @@ comments: true 
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -96,6 +96,20 @@ comments: true paddleocr doc_img_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/img_rot180_demo.jpg ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下命令: + +```bash +# 使用 transformers 引擎进行推理 +paddleocr doc_img_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/img_rot180_demo.jpg \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + +如需了解推理引擎的介绍、速度数据及权重转换等详细信息,请参考后文 [推理引擎](#五推理引擎) 章节。 + 您也可以将文档图像方向分类模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/img_rot180_demo.jpg)到本地。 ```python @@ -109,6 +123,22 @@ for res in output: res.save_to_json("./output/res.json") ``` +如果希望使用 `transformers` 引擎进行推理,可以参考如下示例: + +```python +from paddleocr import DocImgOrientationClassification + +model = DocImgOrientationClassification( + model_name="PP-LCNet_x1_0_doc_ori", + engine="transformers", +) +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + +随后继续按上文方式调用 `predict()` 即可。 + + 运行后,得到的结果为: ```bash @@ -166,6 +196,18 @@ for res in output: + + + + + + + + + + + + @@ -340,4 +382,70 @@ for res in output: 由于 PaddleOCR 并不直接提供文档图像方向分类的训练,因此,如果需要训练文档图像方向分类模型,可以参考 [PaddleX 文档图像方向分类二次开发](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/doc_img_orientation_classification.html#_5)部分进行训练。训练后的模型可以无缝集成到 PaddleOCR 的 API 中进行推理。 -## 五、FAQ +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + +## 五、推理引擎 {#五推理引擎} + +关于推理引擎的详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 + +### 5.1 速度数据 + +
    None
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明
    str|NoneNone
    engine_config含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明
    dict|NoneNone
    enable_hpi 含义:是否启用高性能推理。 bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    PP-LCNet_x1_0_doc_oripaddle_static2.213.360.065.74
    paddle_dynamic2.157.540.079.87
    transformers4.463.440.148.36
    + +测试环境说明: +
      +
    • 测试数据:[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/img_rot180_demo.jpg)
    • +
    • 硬件配置: +
        +
      • GPU:NVIDIA A100 40G
      • +
      • CPU:Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • 软件环境: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 权重转换 {#52-权重转换} + +使用推理引擎时,系统会自动下载官方预训练模型。若需使用自训练模型配合 `paddle_dynamic` 或 `transformers` 引擎,请参考 [PaddleX 文档图像方向分类模块权重转换](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/doc_img_orientation_classification.html#442) 部分,将 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式,即可无缝集成到 PaddleOCR 的 API 中进行推理。 + +## 六、FAQ diff --git a/docs/version3.x/module_usage/doc_vlm.en.md b/docs/version3.x/module_usage/doc_vlm.en.md index 6d09b4181da..a96050f9ad4 100644 --- a/docs/version3.x/module_usage/doc_vlm.en.md +++ b/docs/version3.x/module_usage/doc_vlm.en.md @@ -50,6 +50,8 @@ You can quickly experience it with one line of command: paddleocr doc_vlm -i "{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': '识别这份表格的内容, 以markdown格式输出'}" ``` +The example above uses the paddle_dynamic inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + Note: The official models would be download from HuggingFace by default. If can't access to HuggingFace, please set the environment variable `PADDLE_PDX_MODEL_SOURCE="BOS"` to change the model source to BOS. In the future, more model sources will be supported. You can also integrate the model inference from the open document visual language model module into your project. Before running the following code, please download the [sample image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png) locally. @@ -66,6 +68,8 @@ for res in results: res.save_to_json(f"./output/res.json") ``` +The example above uses the paddle_dynamic inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + After running, the result is: ```bash @@ -138,6 +142,18 @@ By default, GPU 0 is used if available; otherwise, CPU is used. 
str|None None + +engine +Meaning: Inference engine.
    Description: Supports None (the default), paddle, and paddle_dynamic. When left as None, local inference uses the paddle_dynamic engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + diff --git a/docs/version3.x/module_usage/doc_vlm.md b/docs/version3.x/module_usage/doc_vlm.md index 1a3472b7e2e..6fdc4e47fcf 100644 --- a/docs/version3.x/module_usage/doc_vlm.md +++ b/docs/version3.x/module_usage/doc_vlm.md @@ -52,6 +52,8 @@ comments: true paddleocr doc_vlm -i "{'image': 'https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png', 'query': '识别这份表格的内容, 以markdown格式输出'}" ``` +上述示例默认使用 paddle_dynamic 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + 注:PaddleOCR 官方模型默认从 HuggingFace 获取,如运行环境访问 HuggingFace 不便,可通过环境变量修改模型源为 BOS:`PADDLE_PDX_MODEL_SOURCE="BOS"`,未来将支持更多主流模型源; 您也可以将开放文档类视觉语言模型模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png)到本地。 @@ -68,6 +70,8 @@ for res in results: res.save_to_json(f"./output/res.json") ``` +上述示例默认使用 paddle_dynamic 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + 运行后,得到的结果为: ```bash @@ -141,6 +145,18 @@ for res in results: str|None None + +engine +含义:推理引擎。
    说明:支持 None(默认值)、paddle、paddle_dynamic。保持为默认值 None 时,本地推理默认使用 paddle_dynamic 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 +str|None +None + + +engine_config +含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。 +dict|None +None + diff --git a/docs/version3.x/module_usage/formula_recognition.en.md b/docs/version3.x/module_usage/formula_recognition.en.md index deaa7f249f9..b7293e18e8b 100644 --- a/docs/version3.x/module_usage/formula_recognition.en.md +++ b/docs/version3.x/module_usage/formula_recognition.en.md @@ -10,7 +10,7 @@ The formula recognition module is a key component of an OCR (Optical Character R ## II. Supported Model List -> The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. The "Normal Mode" values correspond to the local paddle_static inference engine. @@ -101,7 +101,7 @@ The formula recognition module is a key component of an OCR (Optical Character R
  • Software Environment:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -141,8 +141,11 @@ You can quickly try it out with a single command: ```bash paddleocr formula_recognition -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_formula_rec_001.png + ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + Note: The official models would be download from HuggingFace by default. If can't access to HuggingFace, please set the environment variable `PADDLE_PDX_MODEL_SOURCE="BOS"` to change the model source to BOS. In the future, more model sources will be supported. You can also integrate the model inference from the formula recognition module into your own project.Before running the code below, please download the [example image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_formula_rec_001.png) locally. @@ -156,6 +159,9 @@ for res in output: res.save_to_img(save_path="./output/") res.save_to_json(save_path="./output/res.json") ``` + +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + After running, the output is: ```bash {'res': {'input_path': '/root/.paddlex/predict_input/general_formula_rec_001.png', 'page_index': None, 'rec_formula': '\\zeta_{0}(\\nu)=-\\frac{\\nu\\varrho^{-2\\nu}}{\\pi}\\int_{\\mu}^{\\infty}d\\omega\\int_{C_{+}}d z\\frac{2z^{2}}{(z^{2}+\\omega^{2})^{\\nu+1}}\\breve{\\Psi}(\\omega;z)e^{i\\epsilon z}\\quad,'}} @@ -211,6 +217,18 @@ By default, GPU 0 is used if available; otherwise, CPU is used. 
+ + + + + + + + + + + + diff --git a/docs/version3.x/module_usage/formula_recognition.md b/docs/version3.x/module_usage/formula_recognition.md index 4c131adb52a..d4bd1829ad7 100644 --- a/docs/version3.x/module_usage/formula_recognition.md +++ b/docs/version3.x/module_usage/formula_recognition.md @@ -10,7 +10,7 @@ comments: true ## 二、支持模型列表 -> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。
    None
    engine Meaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|None None
    engine_config Meaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration.
    dict|None None
    enable_hpi Whether to enable high-performance inference. bool
    @@ -105,7 +105,7 @@ comments: true
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -149,6 +149,8 @@ comments: true paddleocr formula_recognition -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_formula_rec_001.png ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + 注:PaddleOCR 官方模型默认从 HuggingFace 获取,如运行环境访问 HuggingFace 不便,可通过环境变量修改模型源为 BOS:`PADDLE_PDX_MODEL_SOURCE="BOS"`,未来将支持更多主流模型源; 您也可以将公式识别的模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_formula_rec_001.png)到本地。 @@ -163,6 +165,8 @@ for res in output: res.save_to_json(save_path="./output/res.json") ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + 运行后,得到的结果为: ```bash @@ -218,6 +222,18 @@ sudo apt-get install texlive texlive-latex-base texlive-xetex latex-cjk-all texl + + + + + + + + + + + + diff --git a/docs/version3.x/module_usage/layout_analysis.en.md b/docs/version3.x/module_usage/layout_analysis.en.md index 8d2f71c66c2..048cb886ef1 100644 --- a/docs/version3.x/module_usage/layout_analysis.en.md +++ b/docs/version3.x/module_usage/layout_analysis.en.md @@ -40,7 +40,7 @@ As shown in the figure above, PP-DocLayoutV2 embeds the targets detected by RT-D The following table only presents the layout detection accuracy of PP-DocLayoutV2. The evaluation dataset is a self-built layout region detection dataset, containing 1,000 images of various document types such as Chinese and English papers, magazines, newspapers, research reports, PPTs, exam papers, textbooks, etc., and covering 25 common layout element categories: document title, section header, text, vertical text, page number, abstract, table of contents, references, footnote, image caption, header, footer, header image, footer image, algorithm, inline formula, display formula, formula number, image, table, figure title (figure title, table title, chart title), seal, chart, aside text, and reference content. 
- +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. The "Standard" values correspond to the local paddle_static inference engine.
    None
    engine 含义:推理引擎。
    说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic、transformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。
    str|None None
    engine_config 含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。
    dict|None None
    enable_hpi 是否启用高性能推理。 bool
    @@ -90,11 +90,26 @@ The following table only presents the layout detection accuracy of PP-DocLayoutV
    -## III. Quick Integration +## 3. Quick Integration > ❗ Before quick integration, please install the PaddleOCR wheel package. For detailed instructions, refer to [PaddleOCR Local Installation Tutorial](../installation.en.md)。 +You can quickly try it out with a single command: + +```bash +paddleocr layout_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg --model_name PP-DocLayoutV3 +``` + +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following command: + +```bash +paddleocr layout_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg --model_name PP-DocLayoutV3 \ + --engine transformers +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. Note: The official models would be download from HuggingFace by default. If can't access to HuggingFace, please set the environment variable PADDLE_PDX_MODEL_SOURCE="BOS" to change the model source to BOS. In the future, more model sources will be supported. @@ -111,6 +126,28 @@ for res in output: res.save_to_json(save_path="./output/res.json") ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). 
+ +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following code: + +```python +from paddleocr import LayoutDetection + +model = LayoutDetection( + model_name="PP-DocLayoutV2", + engine="transformers", +) +output = model.predict("layout.jpg", batch_size=1, layout_nms=True) +for res in output: + res.print() + res.save_to_img(save_path="./output/") + res.save_to_json(save_path="./output/res.json") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + +If you want to use the trained model with the `paddle_dynamic` or `transformers` engine, refer to the [Weight Conversion](#42-weight-conversion) section in the [Inference Engine](#4-inference-engine) section below to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + After running, the result obtained is: ```bash @@ -171,6 +208,18 @@ By default, GPU 0 is used if available; otherwise, CPU is used. None +engine +Meaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + + enable_hpi Meaning:Whether to enable high-performance inference. bool @@ -444,3 +493,89 @@ If set to None, the instantiation value is used; otherwise, this pa Get the visualized image in dict format + +## 4. Inference Engine + +For detailed descriptions, values, compatibility rules, and examples of the inference engine, please refer to Inference Engine and Configuration Description. + +### 4.1 Speed Data + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    model engine Preprocessing (ms) Inference (ms) PostProcessing (ms) End-to-End (ms)
    PP-DocLayoutV3 paddle_static 10.95 47.99 12.97 72.33
    paddle_dynamic 11.33 84.48 1.31 98.01
    transformers 16.94 47.11 13.83 78.97
    PP-DocLayoutV2 paddle_static 10.48 30.94 1.33 42.93
    paddle_dynamic 11.07 86.38 1.33 99.80
    transformers 16.76 49.08 2.43 69.30
    + +Test Environment Description: +
      +
    • Test Data: [Sample Image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg)
    • +
    • Hardware Configuration: +
        +
      • GPU: NVIDIA A100 40G
      • +
      • CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • Software Environment: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 4.2 Weight Conversion + +When using the inference engine, the system will automatically download the official pre-trained model. If you need to use a self-trained model with the `paddle_dynamic` or `transformers` engine, please refer to the [PaddleX Layout Analysis Module Weight Conversion](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/layout_analysis.html#442) section to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. This allows seamless integration into the PaddleOCR API for inference. diff --git a/docs/version3.x/module_usage/layout_analysis.md b/docs/version3.x/module_usage/layout_analysis.md index a4cc1c2a41c..2143c49cd30 100644 --- a/docs/version3.x/module_usage/layout_analysis.md +++ b/docs/version3.x/module_usage/layout_analysis.md @@ -21,6 +21,8 @@ comments: true 下表仅给出 PP-DocLayoutV2 的版面检测精度。该精度指标的评估数据集是自建的版面区域检测数据集,包含了中英文论文、杂志、报纸、研报、PPT、试卷、课本等 1000 张文档类型图片,包含 25 类常见的版面元素:文档标题、段落标题、文本、竖排文本、页码、摘要、目录、参考文献、脚注、图像脚注、页眉、页脚、页眉图像、页脚图像、算法、行内公式、行间公式、公式编号、图像、表格、图和表标题(图标题、表格标题和图表标题)、印章、图表、侧栏文本和参考文献内容。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。 + @@ -51,12 +53,47 @@ comments: true > ❗ 在快速开始前,请先安装 PaddleOCR 的 wheel 包,详细请参考 [安装教程](../installation.md)。 +使用一行命令即可快速体验: + +```bash +paddleocr layout_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg --model_name PP-DocLayoutV3 +``` + +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下命令: + +```bash +paddleocr layout_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg --model_name PP-DocLayoutV3 \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 您可以将版面区域检测模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg)到本地。 ```python from paddleocr import 
LayoutDetection -model = LayoutDetection(model_name="PP-DocLayoutV2") +model = LayoutDetection(model_name="PP-DocLayoutV3") +output = model.predict("layout.jpg", batch_size=1, layout_nms=True) +for res in output: + res.print() + res.save_to_img(save_path="./output/") + res.save_to_json(save_path="./output/res.json") +``` + +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下代码: + +```python +from paddleocr import LayoutDetection + +model = LayoutDetection( + model_name="PP-DocLayoutV3", + engine="transformers", +) output = model.predict("layout.jpg", batch_size=1, layout_nms=True) for res in output: res.print() @@ -64,6 +101,10 @@ for res in output: res.save_to_json(save_path="./output/res.json") ``` +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#四推理引擎) 中的 [权重转换](#42-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + 运行后,得到的结果为: ```bash @@ -123,6 +164,18 @@ for res in output: + + + + + + + + + + + + @@ -389,3 +442,89 @@ for res in output:
    None
    engine 含义:推理引擎。
    说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic、transformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。
    str|None None
    engine_config 含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。
    dict|None None
    enable_hpi 是否启用高性能推理。 bool获取格式为dict的可视化图像
    + +## 四、推理引擎 {#四推理引擎} + +关于推理引擎的详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 + +### 4.1 速度数据 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    model engine Preprocessing (ms) Inference (ms) PostProcessing (ms) End-to-End (ms)
    PP-DocLayoutV3 paddle_static 10.95 47.99 12.97 72.33
    paddle_dynamic 11.33 84.48 1.31 98.01
    transformers 16.94 47.11 13.83 78.97
    PP-DocLayoutV2 paddle_static 10.48 30.94 1.33 42.93
    paddle_dynamic 11.07 86.38 1.33 99.80
    transformers 16.76 49.08 2.43 69.30
    + +测试环境说明: +
      +
    • 测试数据:[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg)
    • +
    • 硬件配置: +
        +
      • GPU:NVIDIA A100 40G
      • +
      • CPU:Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • 软件环境: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 4.2 权重转换 {#42-权重转换} + +使用推理引擎时,系统会自动下载官方预训练模型。若需使用自训练模型配合 `paddle_dynamic` 或 `transformers` 引擎,请参考 [PaddleX 版面分析模块权重转换](https://paddlepaddle.github.io/PaddleX/main/module_usage/tutorials/ocr_modules/layout_analysis.html#442) 部分,将 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式,即可无缝集成到 PaddleOCR 的 API 中进行推理。 diff --git a/docs/version3.x/module_usage/layout_detection.en.md b/docs/version3.x/module_usage/layout_detection.en.md index 1f7e76a0d85..fc37ef4bf5c 100644 --- a/docs/version3.x/module_usage/layout_detection.en.md +++ b/docs/version3.x/module_usage/layout_detection.en.md @@ -4,12 +4,13 @@ comments: true # Layout Detection Module Tutorial -## I. Overview +## 1. Overview + The core task of structure analysis is to parse and segment the content of input document images. By identifying different elements in the image (such as text, charts, images, etc.), they are classified into predefined categories (e.g., pure text area, title area, table area, image area, list area, etc.), and the position and size of these regions in the document are determined. -## II. Supported Model List +## 2. Supported Model List -> The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. The "Normal Mode" values correspond to the local paddle_static inference engine. * The layout detection model includes 20 common categories: document title, paragraph title, text, page number, abstract, table, references, footnotes, header, footer, algorithm, formula, formula number, image, table, seal, figure_table title, chart, and sidebar text and lists of references @@ -270,7 +271,7 @@ The core task of structure analysis is to parse and segment the content of input
  • Software Environment:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -306,7 +307,7 @@ The core task of structure analysis is to parse and segment the content of input -## III. Quick Integration +## 3. Quick Integration > ❗ Before quick integration, please install the PaddleOCR wheel package. For detailed instructions, refer to [PaddleOCR Local Installation Tutorial](../installation.en.md)。 @@ -314,8 +315,21 @@ Quickly experience with just one command: ```bash paddleocr layout_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg + +``` + +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following command: + +```bash +# Use the transformers engine for inference +paddleocr layout_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg \ + --engine transformers ``` +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + Note: The official models would be download from HuggingFace by default. If can't access to HuggingFace, please set the environment variable PADDLE_PDX_MODEL_SOURCE="BOS" to change the model source to BOS. In the future, more model sources will be supported. You can also integrate the model inference from the layout area detection module into your project. Before running the following code, please download [Example Image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg) Go to the local area. @@ -331,6 +345,28 @@ for res in output: res.save_to_json(save_path="./output/res.json") ``` +The example above uses the paddle_static inference engine by default. 
To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following code: + +```python +from paddleocr import LayoutDetection + +model = LayoutDetection( + model_name="PP-DocLayout_plus-L", + engine="transformers", +) +output = model.predict("layout.jpg", batch_size=1, layout_nms=True) +for res in output: + res.print() + res.save_to_img(save_path="./output/") + res.save_to_json(save_path="./output/res.json") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + +If you want to use the trained model with the `paddle_dynamic` or `transformers` engine, refer to the [Weight Conversion](#52-weight-conversion) section in the [Inference Engine](#5-inference-engine) section below to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + After running, the result obtained is: ```bash @@ -395,6 +431,18 @@ By default, GPU 0 is used if available; otherwise, CPU is used. + + + + + + + + + + + + @@ -671,8 +719,96 @@ If set to None, the instantiation value is used; otherwise, this pa
    None
    engine Meaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|None None
    engine_config Meaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration.
    dict|None None
    enable_hpi Meaning: Whether to enable high-performance inference. bool
    -## IV. Custom Development +## 4. Custom Development Since PaddleOCR does not directly provide training for the layout detection module, if you need to train the layout area detection model, you can refer to [PaddleX Layout Detection Module Secondary Development](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/layout_detection.html#iv-custom-development)Partially conduct training. The trained model can be seamlessly integrated into PaddleOCR's API for inference. -## V. FAQ +If you want to use the `paddle_dynamic` or `transformers` engine with the trained model, please refer to the [Weight Conversion](#52-weight-conversion) section in [Inference Engine](#5-inference-engine) later in this document to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + +## 5. Inference Engine + +For detailed descriptions, values, compatibility rules, and examples of the inference engine, please refer to Inference Engine and Configuration Description. + +### 5.1 Speed Data + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    model engine Preprocessing (ms) Inference (ms) PostProcessing (ms) End-to-End (ms)
    PP-DocLayout_plus-L paddle_static 10.92 26.11 0.16 37.38
    paddle_dynamic 11.09 72.91 0.16 85.10
    transformers 12.65 37.91 0.75 52.24
    PP-DocBlockLayout paddle_static 9.51 27.59 0.08 37.41
    paddle_dynamic 8.94 70.77 0.07 80.73
    transformers 11.37 37.95 0.75 50.96
    + +Test Environment Description: +
      +
    • Test Data: [Sample Image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg)
    • +
    • Hardware Configuration: +
        +
      • GPU: NVIDIA A100 40G
      • +
      • CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • Software Environment: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 Weight Conversion + +When using the inference engine, the system will automatically download the official pre-trained model. If you need to use a self-trained model with the `paddle_dynamic` or `transformers` engine, please refer to the [PaddleX Layout Detection Module Weight Conversion](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/layout_detection.html#442) section to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. This allows seamless integration into the PaddleOCR API for inference. + +## 6. FAQ diff --git a/docs/version3.x/module_usage/layout_detection.md b/docs/version3.x/module_usage/layout_detection.md index 511bb74d862..6ff0d442b6a 100644 --- a/docs/version3.x/module_usage/layout_detection.md +++ b/docs/version3.x/module_usage/layout_detection.md @@ -10,7 +10,7 @@ comments: true ## 二、支持模型列表 -> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。 * 版面检测模型,包含20个常见的类别:文档标题、段落标题、文本、页码、摘要、目录、参考文献、脚注、页眉、页脚、算法、公式、公式编号、图像、表格、图和表标题(图标题、表格标题和图表标题)、印章、图表、侧栏文本和参考文献内容 @@ -274,7 +274,7 @@ comments: true
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -318,6 +318,18 @@ comments: true paddleocr layout_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下命令: + +```bash +# 使用 transformers 引擎进行推理 +paddleocr layout_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 注:PaddleOCR 官方模型默认从 HuggingFace 获取,如运行环境访问 HuggingFace 不便,可通过环境变量修改模型源为 BOS:`PADDLE_PDX_MODEL_SOURCE="BOS"`,未来将支持更多主流模型源; 您也可以将版面区域检测模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg)到本地。 @@ -333,6 +345,28 @@ for res in output: res.save_to_json(save_path="./output/res.json") ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下代码: + +```python +from paddleocr import LayoutDetection + +model = LayoutDetection( + model_name="PP-DocLayout_plus-L", + engine="transformers", +) +output = model.predict("layout.jpg", batch_size=1, layout_nms=True) +for res in output: + res.print() + res.save_to_img(save_path="./output/") + res.save_to_json(save_path="./output/res.json") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + 运行后,得到的结果为: ```bash @@ -398,6 +432,18 @@ for res in output: + + + + + + + + + + + + @@ -670,4 +716,92 @@ for res in output: 由于 PaddleOCR 并不直接提供版面区域检测模块的训练,因此,如果需要训练版面区域测模型,可以参考 [PaddleX 版面区域检测模块二次开发](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/layout_detection.html#_5)部分进行训练。训练后的模型可以无缝集成到 PaddleOCR 的 API 中进行推理。 -## 五、FAQ +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 
引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + +## 五、推理引擎 {#五推理引擎} + +关于推理引擎的详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 + +### 5.1 速度数据 + +
    None
    engine 含义:推理引擎。
    说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic、transformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。
    str|None None
    engine_config 含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。
    dict|None None
    enable_hpi 含义:是否启用高性能推理。 bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    model engine Preprocessing (ms) Inference (ms) PostProcessing (ms) End-to-End (ms)
    PP-DocLayout_plus-L paddle_static 10.92 26.11 0.16 37.38
    paddle_dynamic 11.09 72.91 0.16 85.10
    transformers 12.65 37.91 0.75 52.24
    PP-DocBlockLayout paddle_static 9.51 27.59 0.08 37.41
    paddle_dynamic 8.94 70.77 0.07 80.73
    transformers 11.37 37.95 0.75 50.96
    + +测试环境说明: +
      +
    • 测试数据:[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg)
    • +
    • 硬件配置: +
        +
      • GPU:NVIDIA A100 40G
      • +
      • CPU:Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • 软件环境: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 权重转换 {#52-权重转换} + +使用推理引擎时,系统会自动下载官方预训练模型。若需使用自训练模型配合 `paddle_dynamic` 或 `transformers` 引擎,请参考 [PaddleX 版面区域检测模块权重转换](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/layout_detection.html#442) 部分,将 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式,即可无缝集成到 PaddleOCR 的 API 中进行推理。 + +## 六、FAQ diff --git a/docs/version3.x/module_usage/seal_text_detection.en.md b/docs/version3.x/module_usage/seal_text_detection.en.md index 06a41e37a45..eebac34edee 100644 --- a/docs/version3.x/module_usage/seal_text_detection.en.md +++ b/docs/version3.x/module_usage/seal_text_detection.en.md @@ -9,7 +9,7 @@ The seal text detection module typically outputs multi-point bounding boxes arou ## II. Supported Model List -> The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. The "Normal Mode" values correspond to the local paddle_static inference engine. @@ -59,7 +59,7 @@ The seal text detection module typically outputs multi-point bounding boxes arou
  • Software Environment:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -101,8 +101,11 @@ Quickly experience with just one command: ```bash paddleocr seal_text_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/seal_text_det.png + ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + Note: The official models would be download from HuggingFace by default. If can't access to HuggingFace, please set the environment variable `PADDLE_PDX_MODEL_SOURCE="BOS"` to change the model source to BOS. In the future, more model sources will be supported. You can also integrate the model inference from the layout area detection module into your project. Before running the following code, please download [Example Image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/seal_text_det.png) Go to the local area. @@ -117,6 +120,8 @@ for res in output: res.save_to_json(save_path="./output/res.json") ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + After running, the result is: ```bash @@ -181,6 +186,18 @@ By default, GPU 0 is used if available; otherwise, CPU is used. + + + + + + + + + + + + diff --git a/docs/version3.x/module_usage/seal_text_detection.md b/docs/version3.x/module_usage/seal_text_detection.md index 9e3890510fa..4f5ceeb581f 100644 --- a/docs/version3.x/module_usage/seal_text_detection.md +++ b/docs/version3.x/module_usage/seal_text_detection.md @@ -10,7 +10,7 @@ comments: true ## 二、支持模型列表 -> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。
    None
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    engine_configMeaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration.
    dict|NoneNone
    enable_hpi Meaning: Whether to enable high-performance inference. bool
    @@ -59,7 +59,7 @@ comments: true
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -102,6 +102,8 @@ comments: true paddleocr seal_text_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/seal_text_det.png ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + 注:PaddleOCR 官方模型默认从 HuggingFace 获取,如运行环境访问 HuggingFace 不便,可通过环境变量修改模型源为 BOS:`PADDLE_PDX_MODEL_SOURCE="BOS"`,未来将支持更多主流模型源; 您也可以将印章文本检测的模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/seal_text_det.png)到本地。 @@ -116,6 +118,8 @@ for res in output: res.save_to_json(save_path="./output/res.json") ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + 运行后,得到的结果为: ```bash @@ -180,6 +184,18 @@ for res in output: + + + + + + + + + + + + diff --git a/docs/version3.x/module_usage/table_cells_detection.en.md b/docs/version3.x/module_usage/table_cells_detection.en.md index 81236898583..157269af21c 100644 --- a/docs/version3.x/module_usage/table_cells_detection.en.md +++ b/docs/version3.x/module_usage/table_cells_detection.en.md @@ -6,13 +6,13 @@ comments: true # Table Cell Detection Module Usage Tutorial -## I. Overview +## 1. Overview The Table Cell Detection Module is a key component of the table recognition task, responsible for locating and marking each cell region in table images. The performance of this module directly affects the accuracy and efficiency of the entire table recognition process. The Table Cell Detection Module typically outputs bounding boxes for each cell region, which are then passed as input to the table recognition pipeline for further processing. -## II. Supported Model List +## 2. Supported Model List -> The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. 
The "Regular Mode" values correspond to the local paddle_static inference engine.
    None
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明
    str|NoneNone
    engine_config含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明
    dict|NoneNone
    enable_hpi 含义:是否启用高性能推理。 bool
    @@ -53,7 +53,7 @@ The Table Cell Detection Module is a key component of the table recognition task
  • Software Environment:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -86,7 +86,7 @@ The Table Cell Detection Module is a key component of the table recognition task
    -## III. Quick Start +## 3. Quick Start > ❗ Before starting quickly, please first install the PaddleOCR wheel package. For details, please refer to the [installation tutorial](../installation.en.md). @@ -94,8 +94,21 @@ You can quickly experience it with one command: ```bash paddleocr table_cells_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg + +``` + +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following command: + +```bash +# Use the transformers engine for inference +paddleocr table_cells_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg \ + --engine transformers ``` +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + Note: The official models are downloaded from HuggingFace by default. If you cannot access HuggingFace, please set the environment variable `PADDLE_PDX_MODEL_SOURCE="BOS"` to change the model source to BOS. In the future, more model sources will be supported. You can also integrate model inference from the table cell detection module into your project. Before running the following code, please download the [sample image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg) locally. @@ -110,6 +123,27 @@ for res in output: res.save_to_json("./output/res.json") ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). 
+ +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following code: + +```python +from paddleocr import TableCellsDetection +model = TableCellsDetection( + model_name="RT-DETR-L_wired_table_cell_det", + engine="transformers", +) +output = model.predict("table_recognition.jpg", threshold=0.3, batch_size=1) +for res in output: + res.print(json_format=False) + res.save_to_img("./output/") + res.save_to_json("./output/res.json") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + +If you want to use the trained model with the `paddle_dynamic` or `transformers` engine, refer to the [Weight Conversion](#52-weight-conversion) section in the [Inference Engine](#5-inference-engine) section below to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + After running, the result obtained is: ``` @@ -171,6 +205,18 @@ By default, GPU 0 is used if available; otherwise, CPU is used. None +engine +Meaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + + enable_hpi Meaning:Whether to enable high-performance inference. bool @@ -359,8 +405,96 @@ Positive integer. -## IV. Secondary Development +## 4. Secondary Development Since PaddleOCR does not directly provide training for the table cell detection module, if you need to train a table cell detection model, you can refer to the [PaddleX Table Cell Detection Module Secondary Development](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/table_cells_detection.html#iv-secondary-development) section for training. The trained model can be seamlessly integrated into the PaddleOCR API for inference. -## V. FAQ +If you want to use the `paddle_dynamic` or `transformers` engine with the trained model, please refer to the [Weight Conversion](#52-weight-conversion) section in [Inference Engine](#5-inference-engine) later in this document to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + +## 5. Inference Engine + +For detailed descriptions, values, compatibility rules, and examples of the inference engine, please refer to Inference Engine and Configuration Description. + +### 5.1 Speed Data + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    RT-DETR-L_wired_table_cell_detpaddle_static3.5923.110.1427.02
    paddle_dynamic4.0470.380.1575.49
    transformers3.6937.300.7142.10
    RT-DETR-L_wireless_table_cell_detpaddle_static3.7723.440.1427.52
    paddle_dynamic4.0169.970.1575.10
    transformers3.6937.110.7141.91
    + +Test Environment Description: +
      +
    • Test Data: <a href="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg">Sample Image</a>
    • +
    • Hardware Configuration: +
        +
      • GPU: NVIDIA A100 40G
      • +
      • CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • Software Environment: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 Weight Conversion + +When using the inference engine, the system will automatically download the official pre-trained model. If you need to use a self-trained model with the `paddle_dynamic` or `transformers` engine, please refer to the [PaddleX Table Cell Detection Module Weight Conversion](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/table_cells_detection.html#442) section to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. This allows seamless integration into the PaddleOCR API for inference. + +## 6. FAQ diff --git a/docs/version3.x/module_usage/table_cells_detection.md b/docs/version3.x/module_usage/table_cells_detection.md index eb56420a5a0..e006ddb9a01 100644 --- a/docs/version3.x/module_usage/table_cells_detection.md +++ b/docs/version3.x/module_usage/table_cells_detection.md @@ -10,7 +10,7 @@ comments: true ## 二、支持模型列表 -> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。 @@ -52,7 +52,7 @@ comments: true
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -95,6 +95,18 @@ comments: true paddleocr table_cells_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下命令: + +```bash +# 使用 transformers 引擎进行推理 +paddleocr table_cells_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 注:PaddleOCR 官方模型默认从 HuggingFace 获取,如运行环境访问 HuggingFace 不便,可通过环境变量修改模型源为 BOS:`PADDLE_PDX_MODEL_SOURCE="BOS"`,未来将支持更多主流模型源; 您也可以将表格单元格检测的模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg)到本地。 @@ -109,6 +121,27 @@ for res in output: res.save_to_json("./output/res.json") ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下代码: + +```python +from paddleocr import TableCellsDetection +model = TableCellsDetection( + model_name="RT-DETR-L_wired_table_cell_det", + engine="transformers", +) +output = model.predict("table_recognition.jpg", threshold=0.3, batch_size=1) +for res in output: + res.print(json_format=False) + res.save_to_img("./output/") + res.save_to_json("./output/res.json") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + 运行后,得到的结果为: ``` @@ -171,6 +204,18 @@ for res in output: + + + + + + + + + + + + @@ -375,4 +420,92 @@ for res in output: 由于 PaddleOCR 并不直接提供表格单元格检测模块的训练,因此,如果需要训练表格单元格检测模型,可以参考 [PaddleX 表格单元格检测模块二次开发](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/table_cells_detection.html#_4)部分进行训练。训练后的模型可以无缝集成到 PaddleOCR 的 API 
中进行推理。 -## 五、FAQ +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + +## 五、推理引擎 {#五推理引擎} + +关于推理引擎的详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 + +### 5.1 速度数据 + +
    None
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明
    str|NoneNone
    engine_config含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明
    dict|NoneNone
    enable_hpi 含义:是否启用高性能推理。 bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    RT-DETR-L_wired_table_cell_detpaddle_static3.5923.110.1427.02
    paddle_dynamic4.0470.380.1575.49
    transformers3.6937.300.7142.10
    RT-DETR-L_wireless_table_cell_detpaddle_static3.7723.440.1427.52
    paddle_dynamic4.0169.970.1575.10
    transformers3.6937.110.7141.91
    + +测试环境说明: +
      +
    • 测试数据:<a href="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg">示例图片</a>
    • +
    • 硬件配置: +
        +
      • GPU:NVIDIA A100 40G
      • +
      • CPU:Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • 软件环境: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 权重转换 {#52-权重转换} + +使用推理引擎时,系统会自动下载官方预训练模型。若需使用自训练模型配合 `paddle_dynamic` 或 `transformers` 引擎,请参考 [PaddleX 表格单元格检测模块权重转换](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/table_cells_detection.html#442) 部分,将 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式,即可无缝集成到 PaddleOCR 的 API 中进行推理。 + +## 六、FAQ diff --git a/docs/version3.x/module_usage/table_classification.en.md b/docs/version3.x/module_usage/table_classification.en.md index e32294620a1..a0990ddfa2f 100644 --- a/docs/version3.x/module_usage/table_classification.en.md +++ b/docs/version3.x/module_usage/table_classification.en.md @@ -12,7 +12,7 @@ The Table Classification Module is a key component in computer vision systems, r ## 2. Supported Model List -> The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. The "Regular Mode" values correspond to the local paddle_static inference engine. @@ -47,7 +47,7 @@ The Table Classification Module is a key component in computer vision systems, r
  • Software Environment:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -90,6 +90,18 @@ You can quickly experience it with one command: paddleocr table_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following command: + +```bash +# Use the transformers engine for inference +paddleocr table_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg \ + --engine transformers +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + Note: The official models would be download from HuggingFace by default. If can't access to HuggingFace, please set the environment variable `PADDLE_PDX_MODEL_SOURCE="BOS"` to change the model source to BOS. In the future, more model sources will be supported. You can also integrate model inference from the table classification module into your project. Before running the following code, please download the [sample image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg) locally. @@ -103,6 +115,26 @@ for res in output: res.save_to_json("./output/res.json") ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). 
+ +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following code: + +```python +from paddleocr import TableClassification +model = TableClassification( + model_name="PP-LCNet_x1_0_table_cls", + engine="transformers", +) +output = model.predict("table_recognition.jpg", batch_size=1) +for res in output: + res.print(json_format=False) + res.save_to_json("./output/res.json") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + +If you want to use the trained model with the `paddle_dynamic` or `transformers` engine, refer to the [Weight Conversion](#52-weight-conversion) section in the [Inference Engine](#5-inference-engine) section below to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + After running, the result obtained is: ``` @@ -161,6 +193,18 @@ By default, GPU 0 is used if available; otherwise, CPU is used. + + + + + + + + + + + + @@ -321,4 +365,70 @@ If MKL-DNN is unavailable or the model does not support it, acceleration will no Since PaddleOCR does not directly provide training for the table classification module, if you need to train a table classification model, you can refer to the [PaddleX Table Classification Module Secondary Development](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/table_classification.html#iv-secondary-development) section for training. The trained model can be seamlessly integrated into the PaddleOCR API for inference. -## 5. FAQ +If you want to use the `paddle_dynamic` or `transformers` engine with the trained model, please refer to the [Weight Conversion](#52-weight-conversion) section in [Inference Engine](#5-inference-engine) later in this document to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + +## 5. 
Inference Engine + +For detailed descriptions, values, compatibility rules, and examples of the inference engine, please refer to Inference Engine and Configuration Description. + +### 5.1 Speed Data + +
    None
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    engine_configMeaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration.
    dict|NoneNone
    enable_hpi Meaning: Whether to enable high-performance inference. bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    PP-LCNet_x1_0_table_clspaddle_static3.563.380.067.11
    paddle_dynamic3.577.770.0711.52
    transformers9.303.720.1514.05
    + +Test Environment Description: +
      +
    • Test Data: <a href="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_image_classification_001.jpg">Sample Image</a>
    • +
    • Hardware Configuration: +
        +
      • GPU: NVIDIA A100 40G
      • +
      • CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • Software Environment: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 Weight Conversion + +When using the inference engine, the system will automatically download the official pre-trained model. If you need to use a self-trained model with the `paddle_dynamic` or `transformers` engine, please refer to the [PaddleX Table Classification Module Weight Conversion](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/table_classification.html#442) section to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. This allows seamless integration into the PaddleOCR API for inference. + +## 6. FAQ diff --git a/docs/version3.x/module_usage/table_classification.md b/docs/version3.x/module_usage/table_classification.md index 573d74ccfe9..347e2f60235 100644 --- a/docs/version3.x/module_usage/table_classification.md +++ b/docs/version3.x/module_usage/table_classification.md @@ -10,7 +10,7 @@ comments: true ## 二、支持模型列表 -> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。 @@ -45,7 +45,7 @@ comments: true
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -88,6 +88,18 @@ comments: true paddleocr table_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下命令: + +```bash +# 使用 transformers 引擎进行推理 +paddleocr table_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 注:PaddleOCR 官方模型默认从 HuggingFace 获取,如运行环境访问 HuggingFace 不便,可通过环境变量修改模型源为 BOS:`PADDLE_PDX_MODEL_SOURCE="BOS"`,未来将支持更多主流模型源; 您也可以将表格分类的模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg)到本地。 @@ -101,6 +113,26 @@ for res in output: res.save_to_json("./output/res.json") ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下代码: + +```python +from paddleocr import TableClassification +model = TableClassification( + model_name="PP-LCNet_x1_0_table_cls", + engine="transformers", +) +output = model.predict("table_recognition.jpg", batch_size=1) +for res in output: + res.print(json_format=False) + res.save_to_json("./output/res.json") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + 运行后,得到的结果为: ``` @@ -159,6 +191,18 @@ for res in output: + + + + + + + + + + + + @@ -322,4 +366,70 @@ for res in output: 由于 PaddleOCR 并不直接提供表格分类模块的训练,因此,如果需要训练表格分类模型,可以参考 [PaddleX 表格分类模块二次开发](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/table_classification.html#_5)部分进行训练。训练后的模型可以无缝集成到 PaddleOCR 的 API 中进行推理。 -## 五、FAQ +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 
[推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + +## 五、推理引擎 {#五推理引擎} + +关于推理引擎的详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 + +### 5.1 速度数据 + +
    None
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明
    str|NoneNone
    engine_config含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明
    dict|NoneNone
    enable_hpi 含义:是否启用高性能推理。 bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    PP-LCNet_x1_0_table_clspaddle_static3.563.380.067.11
    paddle_dynamic3.577.770.0711.52
    transformers9.303.720.1514.05
    + +测试环境说明: +
      +
    • 测试数据:<a href="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_image_classification_001.jpg">示例图片</a>
    • +
    • 硬件配置: +
        +
      • GPU:NVIDIA A100 40G
      • +
      • CPU:Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • 软件环境: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 权重转换 {#52-权重转换} + +使用推理引擎时,系统会自动下载官方预训练模型。若需使用自训练模型配合 `paddle_dynamic` 或 `transformers` 引擎,请参考 [PaddleX 表格分类模块权重转换](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/table_classification.html#442) 部分,将 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式,即可无缝集成到 PaddleOCR 的 API 中进行推理。 + +## 六、FAQ diff --git a/docs/version3.x/module_usage/table_structure_recognition.en.md b/docs/version3.x/module_usage/table_structure_recognition.en.md index c117f94023d..65df4b43396 100644 --- a/docs/version3.x/module_usage/table_structure_recognition.en.md +++ b/docs/version3.x/module_usage/table_structure_recognition.en.md @@ -10,7 +10,7 @@ Table structure recognition is an important component of table recognition syste ## 2. Supported Model List -> The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. The "Normal Mode" values correspond to the local paddle_static inference engine. @@ -70,7 +70,7 @@ Table structure recognition is an important component of table recognition syste
  • Software Environment:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -114,6 +114,18 @@ Quickly experience with a single command: paddleocr table_structure_recognition -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following command: + +```bash +# Use the transformers engine for inference +paddleocr table_structure_recognition -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg \ + --engine transformers +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + Note: The official models would be download from HuggingFace by default. If can't access to HuggingFace, please set the environment variable `PADDLE_PDX_MODEL_SOURCE="BOS"` to change the model source to BOS. In the future, more model sources will be supported. You can also integrate the model inference of the table structure recognition module into your own project. Before running the code below, please download the [sample image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg) to your local machine. @@ -127,6 +139,26 @@ for res in output: res.save_to_json("./output/res.json") ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). 
+ +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following code: + +```python +from paddleocr import TableStructureRecognition +model = TableStructureRecognition( + model_name="SLANet", + engine="transformers", +) +output = model.predict(input="table_recognition.jpg", batch_size=1) +for res in output: + res.print(json_format=False) + res.save_to_json("./output/res.json") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + +If you want to use the trained model with the `paddle_dynamic` or `transformers` engine, refer to the [Weight Conversion](#52-weight-conversion) section in the [Inference Engine](#5-inference-engine) section below to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + After running, the result is: ``` @@ -180,6 +212,18 @@ By default, GPU 0 is used if available; otherwise, CPU is used. + + + + + + + + + + + + @@ -411,4 +455,92 @@ You can evaluate the trained weights, such as `output/xxx/xxx.pdparams`, using t ``` At this point, secondary development is complete, and this static graph model can be directly integrated into the PaddleOCR API. -## 5. FAQ +If you want to use the `paddle_dynamic` or `transformers` engine with the trained model, please refer to the [Weight Conversion](#52-weight-conversion) section in [Inference Engine](#5-inference-engine) later in this document to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + +## 5. Inference Engine + +For detailed descriptions, values, compatibility rules, and examples of the inference engine, please refer to Inference Engine and Configuration Description. + +### 5.1 Speed Data + +
    None
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    engine_configMeaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration.
    dict|NoneNone
    enable_hpi Meaning: Whether to enable high-performance inference. bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    SLANeXt_wiredpaddle_static1.5030.910.2332.77
    paddle_dynamic1.7157.440.9160.23
    transformers4.0345.140.7451.12
    SLANeXt_wirelesspaddle_static1.6730.490.2232.51
    paddle_dynamic1.6857.240.9660.05
    transformers4.3045.510.7551.76
    + +Test Environment Description: +
      +
    • Test Data: <a href="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg">Sample Image</a>
    • +
    • Hardware Configuration: +
        +
      • GPU: NVIDIA A100 40G
      • +
      • CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • Software Environment: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 Weight Conversion + +When using the inference engine, the system will automatically download the official pre-trained model. If you need to use a self-trained model with the `paddle_dynamic` or `transformers` engine, please refer to the [PaddleX Table Structure Recognition Module Weight Conversion](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/table_structure_recognition.html#442) section to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. This allows seamless integration into the PaddleOCR API for inference. + +## 6. FAQ diff --git a/docs/version3.x/module_usage/table_structure_recognition.md b/docs/version3.x/module_usage/table_structure_recognition.md index affc9e7ce7f..b40fbb90ae3 100644 --- a/docs/version3.x/module_usage/table_structure_recognition.md +++ b/docs/version3.x/module_usage/table_structure_recognition.md @@ -10,7 +10,7 @@ comments: true ## 二、支持模型列表 -> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。 @@ -70,7 +70,7 @@ comments: true
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -114,6 +114,18 @@ comments: true paddleocr table_structure_recognition -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下命令: + +```bash +# 使用 transformers 引擎进行推理 +paddleocr table_structure_recognition -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 注:PaddleOCR 官方模型默认从 HuggingFace 获取,如运行环境访问 HuggingFace 不便,可通过环境变量修改模型源为 BOS:`PADDLE_PDX_MODEL_SOURCE="BOS"`,未来将支持更多主流模型源; 您也可以将表格结构识别的模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg)到本地。 @@ -127,6 +139,26 @@ for res in output: res.save_to_json("./output/res.json") ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下代码: + +```python +from paddleocr import TableStructureRecognition +model = TableStructureRecognition( + model_name="SLANet", + engine="transformers", +) +output = model.predict(input="table_recognition.jpg", batch_size=1) +for res in output: + res.print(json_format=False) + res.save_to_json("./output/res.json") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + 运行后,得到的结果为: ``` @@ -180,6 +212,18 @@ for res in output: + + + + + + + + + + + + @@ -415,4 +459,92 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ ``` 至此,二次开发完成,该静态图模型可以直接集成到 PaddleOCR 的 API 中。 -## 五、FAQ +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + +## 五、推理引擎 
{#五推理引擎} + +关于推理引擎的详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 + +### 5.1 速度数据 + +
    None
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明
    str|NoneNone
    engine_config含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明
    dict|NoneNone
    enable_hpi 含义:是否启用高性能推理。 bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    SLANeXt_wiredpaddle_static1.5030.910.2332.77
    paddle_dynamic1.7157.440.9160.23
    transformers4.0345.140.7451.12
    SLANeXt_wirelesspaddle_static1.6730.490.2232.51
    paddle_dynamic1.6857.240.9660.05
    transformers4.3045.510.7551.76
    + +测试环境说明: +
      +
    • 测试数据:[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/table_recognition.jpg)
    • +
    • 硬件配置: +
        +
      • GPU:NVIDIA A100 40G
      • +
      • CPU:Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • 软件环境: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 权重转换 {#52-权重转换} + +使用推理引擎时,系统会自动下载官方预训练模型。若需使用自训练模型配合 `paddle_dynamic` 或 `transformers` 引擎,请参考 [PaddleX 表格结构识别模块权重转换](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/table_structure_recognition.html#442) 部分,将 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式,即可无缝集成到 PaddleOCR 的 API 中进行推理。 + +## 六、FAQ diff --git a/docs/version3.x/module_usage/text_detection.en.md b/docs/version3.x/module_usage/text_detection.en.md index a32f9cd9a27..64546c4a20f 100644 --- a/docs/version3.x/module_usage/text_detection.en.md +++ b/docs/version3.x/module_usage/text_detection.en.md @@ -9,7 +9,7 @@ The text detection module is a critical component of OCR (Optical Character Reco ## 2. Supported Models List -> The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. The "Standard Mode" values correspond to the local paddle_static inference engine. @@ -77,7 +77,7 @@ The text detection module is a critical component of OCR (Optical Character Reco
  • Software Environment:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -120,6 +120,18 @@ Use the following command for a quick experience: paddleocr text_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_001.png ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following command: + +```bash +# Use the transformers engine for inference +paddleocr text_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_001.png \ + --engine transformers +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + Note: The official models would be download from HuggingFace by default. If can't access to HuggingFace, please set the environment variable `PADDLE_PDX_MODEL_SOURCE="BOS"` to change the model source to BOS. In the future, more model sources will be supported. You can also integrate the model inference into your project. Before running the following code, download the [example image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_001.png) locally. @@ -134,6 +146,27 @@ for res in output: res.save_to_json(save_path="./output/res.json") ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). 
+ +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following code: + +```python +from paddleocr import TextDetection +model = TextDetection( + model_name="PP-OCRv5_server_det", + engine="transformers", +) +output = model.predict("general_ocr_001.png", batch_size=1) +for res in output: + res.print() + res.save_to_img(save_path="./output/") + res.save_to_json(save_path="./output/res.json") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + +If you want to use the trained model with the `paddle_dynamic` or `transformers` engine, refer to the [Weight Conversion](#52-weight-conversion) section in the [Inference Engine](#5-inference-engine) section below to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + The output will be: ```bash @@ -199,6 +232,18 @@ By default, GPU 0 is used if available; otherwise, CPU is used. + + + + + + + + + + + + @@ -533,7 +578,95 @@ After export, the static graph model will be saved in `./PP-OCRv5_server_det_inf ``` The custom development is now complete. This static graph model can be directly integrated into PaddleOCR's API. -## 5. FAQ +If you want to use the `paddle_dynamic` or `transformers` engine with the trained model, please refer to the [Weight Conversion](#52-weight-conversion) section in [Inference Engine](#5-inference-engine) later in this document to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + +## 5. Inference Engine + +For detailed descriptions, values, compatibility rules, and examples of the inference engine, please refer to Inference Engine and Configuration Description. + +### 5.1 Speed Data + +
    None
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    engine_configMeaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration.
    dict|NoneNone
    enable_hpi Meaning:Whether to enable high-performance inference. bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    PP-OCRv5_mobile_detpaddle_static11.4313.802.1527.58
    paddle_dynamic11.7048.362.4762.71
    transformers14.0518.453.9837.54
    PP-OCRv5_server_detpaddle_static13.2426.912.6343.05
    paddle_dynamic11.8245.562.5260.10
    transformers14.5613.767.4436.76
    + +Test Environment Description: +
      +
    • Test Data: [Sample Image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_001.png)
    • +
    • Hardware Configuration: +
        +
      • GPU: NVIDIA A100 40G
      • +
      • CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • Software Environment: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 Weight Conversion + +When using the inference engine, the system will automatically download the official pre-trained model. If you need to use a self-trained model with the `paddle_dynamic` or `transformers` engine, please refer to the [PaddleX Text Detection Module Weight Conversion](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/text_detection.html#442) section to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. This allows seamless integration into the PaddleOCR API for inference. + +## 6. FAQ - Use parameters `limit_type` and `limit_side_len` to constrain image dimensions. - `limit_type` options: [`max`, `min`] diff --git a/docs/version3.x/module_usage/text_detection.md b/docs/version3.x/module_usage/text_detection.md index e1721c099a2..862f409aa7a 100644 --- a/docs/version3.x/module_usage/text_detection.md +++ b/docs/version3.x/module_usage/text_detection.md @@ -9,7 +9,7 @@ comments: true ## 二、支持模型列表 -> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。 @@ -77,7 +77,7 @@ comments: true
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -155,6 +155,18 @@ paddleocr text_detection -i general_ocr_001.png --model_name PP-OCRv5_server_det paddleocr text_detection -i ./images/ --model_name PP-OCRv5_mobile_det ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下命令: + +```bash +# 使用 transformers 引擎进行推理 +paddleocr text_detection -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_001.png \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 注:PaddleOCR 官方模型默认从 HuggingFace 获取,如运行环境访问 HuggingFace 不便,可通过环境变量修改模型源为 BOS:`PADDLE_PDX_MODEL_SOURCE="BOS"`,未来将支持更多主流模型源; ### 3.3 Python API 使用 @@ -171,6 +183,27 @@ for res in output: res.save_to_json(save_path="./output/res.json") ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下代码: + +```python +from paddleocr import TextDetection +model = TextDetection( + model_name="PP-OCRv5_server_det", + engine="transformers", +) +output = model.predict("general_ocr_001.png", batch_size=1) +for res in output: + res.print() + res.save_to_img(save_path="./output/") + res.save_to_json(save_path="./output/res.json") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + 运行后,得到的结果为: ```bash @@ -235,6 +268,18 @@ for res in output: + + + + + + + + + + + + @@ -576,9 +621,97 @@ python3 tools/export_model.py -c configs/det/PP-OCRv5/PP-OCRv5_server_det.yml -o ``` 至此,二次开发完成,该静态图模型可以直接集成到 PaddleOCR 的 API 中。 -## 五、常见问题与解决方案 +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + +## 五、推理引擎 {#五推理引擎} + +关于推理引擎的详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 + +### 5.1 速度数据 + +
    None
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明
    str|NoneNone
    engine_config含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明
    dict|NoneNone
    enable_hpi 含义:是否启用高性能推理。 bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    PP-OCRv5_mobile_detpaddle_static11.4313.802.1527.58
    paddle_dynamic11.7048.362.4762.71
    transformers14.0518.453.9837.54
    PP-OCRv5_server_detpaddle_static13.2426.912.6343.05
    paddle_dynamic11.8245.562.5260.10
    transformers14.5613.767.4436.76
    + +测试环境说明: +
      +
    • 测试数据:[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_001.png)
    • +
    • 硬件配置: +
        +
      • GPU:NVIDIA A100 40G
      • +
      • CPU:Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • 软件环境: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 权重转换 {#52-权重转换} + +使用推理引擎时,系统会自动下载官方预训练模型。若需使用自训练模型配合 `paddle_dynamic` 或 `transformers` 引擎,请参考 [PaddleX 文本检测模块权重转换](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/text_detection.html#442) 部分,将 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式,即可无缝集成到 PaddleOCR 的 API 中进行推理。 + +## 六、常见问题与解决方案 -### 5.1 性能优化问题 +### 6.1 性能优化问题 #### Q: GPU推理速度慢怎么办? @@ -598,7 +731,7 @@ python3 tools/export_model.py -c configs/det/PP-OCRv5/PP-OCRv5_server_det.yml -o (4)限制GPU内存使用:设置`gpu_mem=200` (5)使用移动端模型:切换到`PP-OCRv5_mobile`系列模型 -### 5.2 检测精度问题 +### 6.2 检测精度问题 #### Q: 检测框不准确或漏检怎么办? @@ -616,7 +749,7 @@ model = TextDetection( (2)使用更精确的后处理模式:设置`det_db_score_mode="slow"` (3)启用膨胀处理:设置`use_dilation=True` -### 5.3 模型选择建议 +### 6.3 模型选择建议 #### Q: 如何选择合适的模型? @@ -626,7 +759,7 @@ model = TextDetection( - 实时处理:使用`PP-OCRv5_mobile_det`,推理速度快 - 批量处理:使用`PP-OCRv5_server_det`,精度高 -### 5.4 参数调优建议 +### 6.4 参数调优建议 #### Q: 如何调优检测参数? @@ -641,7 +774,7 @@ model = TextDetection( - **高速度配置**:`limit_side_len=640`, `thresh=0.5`, `box_thresh=0.7`, `unclip_ratio=1.2` - **平衡配置**:`limit_side_len=960`, `thresh=0.4`, `box_thresh=0.6`, `unclip_ratio=1.5` -### 5.5 错误处理 +### 6.5 错误处理 #### Q: 模型加载失败怎么办? diff --git a/docs/version3.x/module_usage/text_image_unwarping.en.md b/docs/version3.x/module_usage/text_image_unwarping.en.md index 2c6b7184fca..d9e99eb1bec 100644 --- a/docs/version3.x/module_usage/text_image_unwarping.en.md +++ b/docs/version3.x/module_usage/text_image_unwarping.en.md @@ -12,7 +12,7 @@ The primary purpose of text image rectification is to perform geometric transfor ## 2. Supported Model List -> The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. The "Normal Mode" values correspond to the local paddle_static inference engine. 
@@ -53,7 +53,7 @@ The primary purpose of text image rectification is to perform geometric transfor
  • Software Environment:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -96,6 +96,18 @@ You can quickly experience it with one command: paddleocr text_image_unwarping -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/doc_test.jpg ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following command: + +```bash +# Use the transformers engine for inference +paddleocr text_image_unwarping -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/doc_test.jpg \ + --engine transformers +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + Note: The official models would be download from HuggingFace by default. If can't access to HuggingFace, please set the environment variable `PADDLE_PDX_MODEL_SOURCE="BOS"` to change the model source to BOS. In the future, more model sources will be supported. You can also integrate the model inference from the image rectification module into your project. Before running the following code, please download the [sample image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/doc_test.jpg) locally. @@ -110,6 +122,25 @@ for res in output: res.save_to_json(save_path="./output/res.json") ``` +The example above uses the paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). 
+ +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following code: + +```python +from paddleocr import TextImageUnwarping +model = TextImageUnwarping( + model_name="UVDoc", + engine="transformers", +) +output = model.predict("doc_test.jpg", batch_size=1) +for res in output: + res.print() + res.save_to_img(save_path="./output/") + res.save_to_json(save_path="./output/res.json") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + After running, the result obtained is: ```bash @@ -163,6 +194,18 @@ By default, GPU 0 will be used if available; otherwise, the CPU will be used. + + + + + + + + + + + + @@ -183,9 +226,9 @@ For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6) +Options:"fp32", "fp16", etc. - + @@ -342,4 +385,64 @@ Positive integer. The current module does not support fine-tuning training and only supports inference integration. Concerning fine-tuning training for this module, there are plans to support it in the future. -## 5. FAQ +## 5. Inference Engine + +For detailed descriptions, values, compatibility rules, and examples of the inference engine, please refer to Inference Engine and Configuration Description. + +### 5.1 Speed Data + +
    None
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    engine_configMeaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration.
    dict|NoneNone
    enable_hpi Meaning: Whether to use the high performance inference. boolprecision Meaning:Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
    Description: -Options: fp32, fp16, etc.
    strfp32"fp32"
    enable_mkldnn
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    UVDocpaddle_static14.9618.601.9336.66
    paddle_dynamic10.9027.591.9640.94
    transformers13.546.740.9133.07
    + +Test Environment Description: +
      +
    • Test Data: [Sample Image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/doc_test.jpg)
    • +
    • Hardware Configuration: +
        +
      • GPU: NVIDIA A100 40G
      • +
      • CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • Software Environment: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +## 6. FAQ diff --git a/docs/version3.x/module_usage/text_image_unwarping.md b/docs/version3.x/module_usage/text_image_unwarping.md index ff45439b14f..8fdd3c8240f 100644 --- a/docs/version3.x/module_usage/text_image_unwarping.md +++ b/docs/version3.x/module_usage/text_image_unwarping.md @@ -10,7 +10,7 @@ comments: true ## 二、支持模型列表 -> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。 @@ -51,7 +51,7 @@ comments: true
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -95,6 +95,18 @@ comments: true paddleocr text_image_unwarping -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/doc_test.jpg ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下命令: + +```bash +# 使用 transformers 引擎进行推理 +paddleocr text_image_unwarping -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/doc_test.jpg \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 注:PaddleOCR 官方模型默认从 HuggingFace 获取,如运行环境访问 HuggingFace 不便,可通过环境变量修改模型源为 BOS:`PADDLE_PDX_MODEL_SOURCE="BOS"`,未来将支持更多主流模型源; 您也可以将图像矫正的模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/doc_test.jpg)到本地。 @@ -109,6 +121,25 @@ for res in output: res.save_to_json(save_path="./output/res.json") ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下代码: + +```python +from paddleocr import TextImageUnwarping +model = TextImageUnwarping( + model_name="UVDoc", + engine="transformers", +) +output = model.predict("doc_test.jpg", batch_size=1) +for res in output: + res.print() + res.save_to_img(save_path="./output/") + res.save_to_json(save_path="./output/res.json") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 运行后,得到的结果为: ```bash @@ -164,6 +195,18 @@ for res in output: + + + + + + + + + + + + @@ -337,4 +380,64 @@ for res in output: 当前模块暂时不支持微调训练,仅支持推理集成。关于该模块的微调训练,计划在未来支持。 -## 五、FAQ +## 五、推理引擎 + +关于推理引擎的详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 + +### 5.1 速度数据 + +
    None
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明
    str|NoneNone
    engine_config含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明
    dict|NoneNone
    enable_hpi 含义:是否启用高性能推理。 bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    UVDocpaddle_static14.9618.601.9336.66
    paddle_dynamic10.9027.591.9640.94
    transformers13.546.740.9133.07
    + +测试环境说明: +
      +
    • 测试数据:[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/doc_test.jpg)
    • +
    • 硬件配置: +
        +
      • GPU:NVIDIA A100 40G
      • +
      • CPU:Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • 软件环境: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +## 六、FAQ diff --git a/docs/version3.x/module_usage/text_recognition.en.md b/docs/version3.x/module_usage/text_recognition.en.md index 2ff8ed90788..a52c71a0911 100644 --- a/docs/version3.x/module_usage/text_recognition.en.md +++ b/docs/version3.x/module_usage/text_recognition.en.md @@ -10,7 +10,7 @@ The text recognition module is the core part of the OCR (Optical Character Recog ## 2. List of Supported Models -> The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. The "Normal Mode" values correspond to the local paddle_static inference engine. @@ -497,7 +497,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">Inference Model/paddle_static inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following code: + +```python +from paddleocr import TextRecognition +model = TextRecognition( + model_name="PP-OCRv5_server_rec", + engine="transformers", +) +output = model.predict(input="general_ocr_rec_001.png", batch_size=1) +for res in output: + res.print() + res.save_to_img(save_path="./output/") + res.save_to_json(save_path="./output/res.json") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + +If you want to use the trained model with the `paddle_dynamic` or `transformers` engine, refer to the [Weight Conversion](#52-weight-conversion) section in the [Inference Engine](#5-inference-engine) section below to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. 
+ After running, the result is as follows: + ```bash {'res': {'input_path': 'general_ocr_rec_001.png', 'page_index': None, 'rec_text': '绿洲仕格维花园公寓', 'rec_score': 0.9823867082595825}} ``` @@ -608,6 +642,18 @@ By default, GPU 0 is used; if unavailable, CPU is used. + + + + + + + + + + + + @@ -627,7 +673,7 @@ For Paddle with CUDA 11.8, the compatible TensorRT version is 8.x (x>=6), recomm +Options: "fp32", "fp16". @@ -839,4 +885,92 @@ After exporting the model, the static graph model will be stored in `./PP-OCRv5_ ``` At this point, the secondary development is complete. This static graph model can be directly integrated into the PaddleOCR API. -## 5. FAQ +If you want to use the `paddle_dynamic` or `transformers` engine with the trained model, please refer to the [Weight Conversion](#52-weight-conversion) section in [Inference Engine](#5-inference-engine) later in this document to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + +## 5. Inference Engine + +For detailed descriptions, values, compatibility rules, and examples of the inference engine, please refer to Inference Engine and Configuration Description. + +### 5.1 Speed Data + +
    None
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    engine_configMeaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration.
    dict|NoneNone
    enable_hpi Meaning: Whether to enable high performance inference. boolprecision Meaning:Precision for TensorRT when using the Paddle Inference TensorRT subgraph engine.
    Description: -Options: fp32, fp16.
    str "fp32"
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    PP-OCRv5_mobile_recpaddle_static1.946.691.009.76
    paddle_dynamic1.9735.381.1138.60
    transformers3.3117.700.5021.68
    PP-OCRv5_server_recpaddle_static1.9811.371.2114.69
    paddle_dynamic1.9823.891.3227.34
    transformers3.9911.690.5116.36
    + +Test Environment Description: +
      +
    • Test Data: [Sample Image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_rec_001.png)
    • +
    • Hardware Configuration: +
        +
      • GPU: NVIDIA A100 40G
      • +
      • CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • Software Environment: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 Weight Conversion + +When using the inference engine, the system will automatically download the official pre-trained model. If you need to use a self-trained model with the `paddle_dynamic` or `transformers` engine, please refer to the [PaddleX Text Recognition Module Weight Conversion](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/text_recognition.html#442) section to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. This allows seamless integration into the PaddleOCR API for inference. + +## 6. FAQ diff --git a/docs/version3.x/module_usage/text_recognition.md b/docs/version3.x/module_usage/text_recognition.md index a9b49144b90..86840afe7c5 100644 --- a/docs/version3.x/module_usage/text_recognition.md +++ b/docs/version3.x/module_usage/text_recognition.md @@ -10,7 +10,7 @@ comments: true ## 二、支持模型列表 -> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。 @@ -506,7 +506,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">推理模型/推理模型/paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下代码: + +```python +from paddleocr import TextRecognition +model = TextRecognition( + model_name="PP-OCRv5_server_rec", + engine="transformers", +) +output = model.predict(input="general_ocr_rec_001.png", batch_size=1) +for res in output: + res.print() + res.save_to_img(save_path="./output/") + res.save_to_json(save_path="./output/res.json") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + 运行后,得到的结果为: ```bash {'res': {'input_path': 'general_ocr_rec_001.png', 'page_index': None, 'rec_text': '绿洲仕格维花园公寓', 'rec_score': 0.9823867082595825}} @@ -621,6 +654,18 @@ for res in output: + + + + + + + + + + + + @@ 
-863,4 +908,92 @@ Global.save_inference_dir="./PP-OCRv5_server_rec_infer/" ``` 至此,二次开发完成,该静态图模型可以直接集成到 PaddleOCR 的 API 中。 -## 五、FAQ +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + +## 五、推理引擎 {#五推理引擎} + +关于推理引擎的详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 + +### 5.1 速度数据 + +
    None
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic、transformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。
    str|NoneNone
    engine_config含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明
    dict|NoneNone
    enable_hpi 含义:是否启用高性能推理。 bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    PP-OCRv5_mobile_recpaddle_static1.946.691.009.76
    paddle_dynamic1.9735.381.1138.60
    transformers3.3117.700.5021.68
    PP-OCRv5_server_recpaddle_static1.9811.371.2114.69
    paddle_dynamic1.9823.891.3227.34
    transformers3.9911.690.5116.36
    + +测试环境说明: +
      +
    • 测试数据:[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_rec_001.png)
    • +
    • 硬件配置: +
        +
      • GPU:NVIDIA A100 40G
      • +
      • CPU:Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • 软件环境: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 权重转换 {#52-权重转换} + +使用推理引擎时,系统会自动下载官方预训练模型。若需使用自训练模型配合 `paddle_dynamic` 或 `transformers` 引擎,请参考 [PaddleX 文字识别模块权重转换](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/text_recognition.html#442) 部分,将 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式,即可无缝集成到 PaddleOCR 的 API 中进行推理。 + +## 六、FAQ diff --git a/docs/version3.x/module_usage/textline_orientation_classification.en.md b/docs/version3.x/module_usage/textline_orientation_classification.en.md index 0f3a92714bc..ba68f8477e3 100644 --- a/docs/version3.x/module_usage/textline_orientation_classification.en.md +++ b/docs/version3.x/module_usage/textline_orientation_classification.en.md @@ -9,7 +9,7 @@ The text line orientation classification module primarily distinguishes the orie ## 2. Supported Model List -> The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> The inference time only includes the model inference time and does not include the time for pre- or post-processing. The "Normal Mode" values correspond to the local paddle_static inference engine. @@ -61,7 +61,7 @@ The text line orientation classification module primarily distinguishes the orie
  • Software Environment:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -104,11 +104,21 @@ You can quickly experience the functionality with a single command: paddleocr textline_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg ``` +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following command: + +```bash +# Use the transformers engine for inference +paddleocr textline_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg \ + --engine transformers +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + Note: The official models would be download from HuggingFace by default. If can't access to HuggingFace, please set the environment variable PADDLE_PDX_MODEL_SOURCE="BOS" to change the model source to BOS. In the future, more model sources will be supported. You can also integrate the text line orientation classification model into your project. Run the following code after downloading the [example image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg) to your local machine. 
-```bash +```python from paddleocr import TextLineOrientationClassification model = TextLineOrientationClassification(model_name="PP-LCNet_x0_25_textline_ori") output = model.predict("textline_rot180_demo.jpg", batch_size=1) @@ -118,6 +128,25 @@ for res in output: res.save_to_json("./output/res.json") ``` +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured, and then run the following code: + +```python +from paddleocr import TextLineOrientationClassification +model = TextLineOrientationClassification( + model_name="PP-LCNet_x0_25_textline_ori", + engine="transformers", +) +output = model.predict("textline_rot180_demo.jpg", batch_size=1) +for res in output: + res.print(json_format=False) + res.save_to_img("./output/demo.png") + res.save_to_json("./output/res.json") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + +If you want to use the trained model with the `paddle_dynamic` or `transformers` engine, refer to the [Weight Conversion](#52-weight-conversion) section in the [Inference Engine](#5-inference-engine) section below to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + After running, the result obtained is: ```bash @@ -176,6 +205,18 @@ By default, GPU 0 is used if available; otherwise, CPU is used. + + + + + + + + + + + + @@ -391,3 +432,91 @@ Supporting multiple input types ## 4. Custom Development Since PaddleOCR does not natively support training for text line orientation classification, refer to [PaddleX's Custom Development Guide](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/textline_orientation_classification.html#iv-custom-development) for training. Trained models can seamlessly integrate into PaddleOCR's API for inference. 
+ +If you want to use the `paddle_dynamic` or `transformers` engine with the trained model, please refer to the [Weight Conversion](#52-weight-conversion) section in [Inference Engine](#5-inference-engine) later in this document to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. + +## 5. Inference Engine + +For detailed descriptions, values, compatibility rules, and examples of the inference engine, please refer to Inference Engine and Configuration Description. + +### 5.1 Speed Data + +
    None
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, local inference uses the paddle_static engine by default. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    engine_configMeaning: Inference-engine configuration.
    Description: Recommended for use together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration.
    dict|NoneNone
    enable_hpi Meaning: Whether to enable high-performance inference. bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    PP-LCNet_x0_25_textline_oripaddle_static0.302.890.063.34
    paddle_dynamic0.286.520.086.98
    transformers1.303.760.155.36
    PP-LCNet_x1_0_textline_oripaddle_static0.333.200.063.69
    paddle_dynamic0.297.600.078.06
    transformers1.283.470.145.04
    + +Test Environment Description: +
      +
    • Test Data: [Sample Image](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg)
    • +
    • Hardware Configuration: +
        +
      • GPU: NVIDIA A100 40G
      • +
      • CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • Software Environment: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 Weight Conversion + +When using the inference engine, the system will automatically download the official pre-trained model. If you need to use a self-trained model with the `paddle_dynamic` or `transformers` engine, please refer to the [PaddleX Text Line Orientation Classification Module Weight Conversion](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/textline_orientation_classification.html#442) section to convert the model from the `pdparams` format to the `safetensors` format using PaddleX. This allows seamless integration into the PaddleOCR API for inference. diff --git a/docs/version3.x/module_usage/textline_orientation_classification.md b/docs/version3.x/module_usage/textline_orientation_classification.md index 8b3537992a2..e344f2011f4 100644 --- a/docs/version3.x/module_usage/textline_orientation_classification.md +++ b/docs/version3.x/module_usage/textline_orientation_classification.md @@ -10,7 +10,7 @@ comments: true ## 二、支持模型列表 -> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。表格中的“常规模式”耗时对应本地 paddle_static 推理引擎。 @@ -63,7 +63,7 @@ comments: true
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -107,6 +107,18 @@ comments: true paddleocr textline_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下命令: + +```bash +# 使用 transformers 引擎进行推理 +paddleocr textline_orientation_classification -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 注:PaddleOCR 官方模型默认从 HuggingFace 获取,如运行环境访问 HuggingFace 不便,可通过环境变量修改模型源为 BOS:`PADDLE_PDX_MODEL_SOURCE="BOS"`,未来将支持更多主流模型源; 您也可以将文本行方向分类模块中的模型推理集成到您的项目中。运行以下代码前,请您下载[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg)到本地。 @@ -121,6 +133,27 @@ for res in output: res.save_to_json("./output/res.json") ``` +上述示例默认使用 paddle_static 推理引擎,请先按照[飞桨框架安装](../paddlepaddle_installation.md)完成 PaddlePaddle 安装。 + +如果选择 `transformers` 作为推理引擎,请确保已配置 Transformers 环境,然后执行如下代码: + +```python +from paddleocr import TextLineOrientationClassification +model = TextLineOrientationClassification( + model_name="PP-LCNet_x0_25_textline_ori", + engine="transformers", +) +output = model.predict("textline_rot180_demo.jpg", batch_size=1) +for res in output: + res.print(json_format=False) + res.save_to_img("./output/demo.png") + res.save_to_json("./output/res.json") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + 运行后,得到的结果为: ```bash @@ -178,6 +211,18 @@ for res in output: + + + + + + + + + + + + @@ -351,3 +396,91 @@ for res in output: ## 四、二次开发 由于 PaddleOCR 并不直接提供文本行方向分类的训练,因此,如果需要训练文档图像方向分类模型,可以参考 [PaddleX 
文本行方向分类二次开发](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/textline_orientation_classification.html#_5)部分进行训练。训练后的模型可以无缝集成到 PaddleOCR 的 API 中进行推理。 + +训练后的模型如果想使用 `paddle_dynamic` 或 `transformers` 引擎,请参考后文 [推理引擎](#五推理引擎) 中的 [权重转换](#52-权重转换) 部分将模型由 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式。 + +## 五、推理引擎 {#五推理引擎} + +关于推理引擎的详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 + +### 5.1 速度数据 + +
    None
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic、transformers。保持为默认值 None 时,本地推理默认使用 paddle_static 引擎。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。
    str|NoneNone
    engine_config含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明
    dict|NoneNone
    enable_hpi 含义:是否启用高性能推理。 bool
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelenginePreprocessing (ms)Inference (ms)PostProcessing (ms)End-to-End (ms)
    PP-LCNet_x0_25_textline_oripaddle_static0.302.890.063.34
    paddle_dynamic0.286.520.086.98
    transformers1.303.760.155.36
    PP-LCNet_x1_0_textline_oripaddle_static0.333.200.063.69
    paddle_dynamic0.297.600.078.06
    transformers1.283.470.145.04
    + +测试环境说明: +
      +
    • 测试数据:[示例图片](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/textline_rot180_demo.jpg)
    • +
    • 硬件配置: +
        +
      • GPU:NVIDIA A100 40G
      • +
      • CPU:Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
      • +
      +
    • +
    • 软件环境: +
        +
      • Ubuntu 22.04 / CUDA 12.6 / cuDNN 9.5
      • +
      • paddlepaddle-gpu 3.2.1 / paddleocr 3.5 / transformers 5.4.0 / torch 2.10
      • +
      +
    • +
    + +### 5.2 权重转换 {#52-权重转换} + +使用推理引擎时,系统会自动下载官方预训练模型。若需使用自训练模型配合 `paddle_dynamic` 或 `transformers` 引擎,请参考 [PaddleX 文本行方向分类模块权重转换](https://paddlepaddle.github.io/PaddleX/latest/module_usage/tutorials/ocr_modules/textline_orientation_classification.html#442) 部分,将 `pdparams` 格式通过 PaddleX 转换为 `safetensors` 格式,即可无缝集成到 PaddleOCR 的 API 中进行推理。 diff --git a/docs/version3.x/other_devices_support/multi_devices_use_guide.en.md b/docs/version3.x/other_devices_support/multi_devices_use_guide.en.md index 71c72a3c9f3..5c229c6036f 100644 --- a/docs/version3.x/other_devices_support/multi_devices_use_guide.en.md +++ b/docs/version3.x/other_devices_support/multi_devices_use_guide.en.md @@ -5,6 +5,7 @@ comments: true # PaddleOCR Multi-Devices Usage Guide This document focuses on the usage guide of PaddleX for Huawei Ascend NPU and Kunlun XPU hardware platforms. +This document mainly introduces installation and usage with PaddlePaddle. If you plan to use another inference engine, please follow that engine's official documentation for environment setup and installation. 
## 1、Installation diff --git a/docs/version3.x/other_devices_support/multi_devices_use_guide.md b/docs/version3.x/other_devices_support/multi_devices_use_guide.md index d528006598b..115f94cb501 100644 --- a/docs/version3.x/other_devices_support/multi_devices_use_guide.md +++ b/docs/version3.x/other_devices_support/multi_devices_use_guide.md @@ -5,6 +5,7 @@ comments: true # PaddleOCR多硬件使用指南 本文档主要针对昇腾 NPU、昆仑 XPU 等硬件平台,介绍 PaddleOCR 使用指南。 +本文档主要介绍基于飞桨框架的安装与使用方式。若需使用其他推理引擎,请按照对应引擎的官方文档完成环境安装与配置。 ## 1、安装 diff --git a/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.en.md b/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.en.md index 582e80cff99..065a465233a 100644 --- a/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.en.md +++ b/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.en.md @@ -5,6 +5,7 @@ comments: true # Ascend NPU PaddlePaddle Installation Tutorial Currently, PaddleOCR supports the Ascend 910B chip (more models are under support. If you have a related need for other models, please submit an issue to inform us). The Ascend driver version is 23.0.3. Considering the differences in environments, we recommend using the Ascend development image provided by PaddlePaddle to complete the environment preparation. +This guide mainly introduces installation and usage with PaddlePaddle. If you want to use a different inference engine, please follow that engine's official installation and configuration documentation. ## 1. Docker Environment Preparation * Pull the image. This image is only for the development environment and does not contain a pre-compiled PaddlePaddle installation package. The image has CANN-8.0.0, the Ascend operator library, installed by default. 
diff --git a/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.md b/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.md index 02268d32f71..339905ea79f 100644 --- a/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.md +++ b/docs/version3.x/other_devices_support/paddlepaddle_install_NPU.md @@ -5,6 +5,7 @@ comments: true # 昇腾 NPU 飞桨安装教程 当前 PaddleOCR 支持昇腾 910B 芯片(更多型号还在支持中,如果您有其他型号的相关需求,请提交issue告知我们),昇腾驱动版本为 23.0.3。考虑到环境差异性,我们推荐使用飞桨官方提供的昇腾开发镜像完成环境准备。 +本文档主要介绍基于飞桨框架的环境安装与使用方式。若您计划使用其他推理引擎,请参考对应引擎的官方文档完成安装与配置。 ## 1、docker环境准备 * 拉取镜像,此镜像仅为开发环境,镜像中不包含预编译的飞桨安装包,镜像中已经默认安装了昇腾算子库 CANN-8.0.0。 diff --git a/docs/version3.x/other_devices_support/paddlepaddle_install_XPU.en.md b/docs/version3.x/other_devices_support/paddlepaddle_install_XPU.en.md index e14c674e0d4..5d7dd14184d 100644 --- a/docs/version3.x/other_devices_support/paddlepaddle_install_XPU.en.md +++ b/docs/version3.x/other_devices_support/paddlepaddle_install_XPU.en.md @@ -5,6 +5,7 @@ comments: true # Kunlun XPU PaddlePaddle Installation Tutorial Currently, PaddleOCR supports Kunlun R200/R300 and other chips. Considering environmental differences, we recommend using the Kunlun XPU development image officially released by PaddlePaddle, which is pre-installed with the Kunlun basic runtime environment library (XRE). +This guide mainly introduces installation and usage with PaddlePaddle. If you want to use a different inference engine, please follow that engine's official installation and configuration documentation. ## 1. Docker Environment Preparation Pull the image. This image is only for the development environment and does not include a pre-compiled PaddlePaddle installation package. 
diff --git a/docs/version3.x/other_devices_support/paddlepaddle_install_XPU.md b/docs/version3.x/other_devices_support/paddlepaddle_install_XPU.md index 6b99833aa1f..0c0ce6089b6 100644 --- a/docs/version3.x/other_devices_support/paddlepaddle_install_XPU.md +++ b/docs/version3.x/other_devices_support/paddlepaddle_install_XPU.md @@ -5,6 +5,7 @@ comments: true # 昆仑 XPU 飞桨安装教程 当前 PaddleOCR 支持昆仑 R200/R300 等芯片。考虑到环境差异性,我们推荐使用飞桨官方发布的昆仑 XPU 开发镜像,该镜像预装有昆仑基础运行环境库(XRE)。 +本文档主要介绍基于飞桨框架的环境安装与使用方式。若您计划使用其他推理引擎,请参考对应引擎的官方文档完成安装与配置。 ## 1、docker环境准备 拉取镜像,此镜像仅为开发环境,镜像中不包含预编译的飞桨安装包 diff --git a/docs/version3.x/paddleocr_and_paddlex.en.md b/docs/version3.x/paddleocr_and_paddlex.en.md index c6711b851d4..4e1c1189194 100644 --- a/docs/version3.x/paddleocr_and_paddlex.en.md +++ b/docs/version3.x/paddleocr_and_paddlex.en.md @@ -20,7 +20,7 @@ PaddleOCR fully reuses the capabilities of PaddleX in the inference deployment p - The high-performance inference capabilities of PaddleOCR are achieved through PaddleX's Paddle2ONNX plugin and high-performance inference plugins. - The service deployment solutions of PaddleOCR are based on PaddleX's implementations. -It is important to note that although PaddleOCR uses PaddleX at the underlying level, thanks to PaddleX’s optional dependency installation feature, **installing the PaddleOCR inference package does not include all of PaddleX’s dependencies—only those required for OCR-related tasks are installed**. Therefore, users generally do not need to worry about excessive expansion of dependency size. Tested in May 2025, in an x86-64 + Linux + Python 3.10 environment, the total size of required dependencies increased only from 717 MB to 738 MB. +It is important to note that although PaddleOCR uses PaddleX at the underlying level, thanks to PaddleX’s optional dependency installation feature, **installing the `paddleocr` Python distribution package does not include all of PaddleX’s dependencies—only those required for OCR-related tasks are installed**. 
Therefore, users generally do not need to worry about excessive expansion of dependency size. Tested in May 2025, in an x86-64 + Linux + Python 3.10 environment, the total size of required dependencies increased only from 717 MB to 738 MB. The version correspondence between PaddleOCR, PaddleX, and the PaddlePaddle framework is as follows: @@ -34,6 +34,7 @@ The version correspondence between PaddleOCR, PaddleX, and the PaddlePaddle fram | `3.2.x` | `>= 3.2.0, < 3.3.0` | `>= 3.0.0` | | `3.3.x` | `>= 3.3.0, < 3.4.0` | `>= 3.0.0` | | `3.4.x` | `>= 3.4.0, < 3.5.0` | `>= 3.0.0` | +| `3.5.x` | `>= 3.5.0, < 3.6.0` | `>= 3.0.0` | ## 2. Correspondence Between PaddleOCR Pipelines and PaddleX Pipeline Registration Names diff --git a/docs/version3.x/paddleocr_and_paddlex.md b/docs/version3.x/paddleocr_and_paddlex.md index abb66cf716f..efb48f02c2c 100644 --- a/docs/version3.x/paddleocr_and_paddlex.md +++ b/docs/version3.x/paddleocr_and_paddlex.md @@ -20,7 +20,7 @@ PaddleOCR 在推理部署环节充分复用了 PaddleX 的能力,具体包括 - PaddleOCR 的高性能推理能力通过 PaddleX 的 Paddle2ONNX 插件及高性能推理插件实现。 - PaddleOCR 的服务化部署方案基于 PaddleX 的实现。 -需要特别说明的是,尽管 PaddleOCR 在底层使用了 PaddleX,但得益于 PaddleX 的可选依赖安装功能,**安装 PaddleOCR 推理包时并不会安装 PaddleX 的全部依赖,而只会安装 OCR 类任务需要使用到的依赖**,用户通常无需关心依赖体积的过度膨胀问题。2025 年 5 月测试,在 x86-64 + Linux + Python 3.10 环境中,需要安装的依赖总体积仅仅从 717 MB 增加到 738 MB。 +需要特别说明的是,尽管 PaddleOCR 在底层使用了 PaddleX,但得益于 PaddleX 的可选依赖安装功能,**安装 `paddleocr` Python 分发包时并不会安装 PaddleX 的全部依赖,而只会安装 OCR 类任务需要使用到的依赖**,用户通常无需关心依赖体积的过度膨胀问题。2025 年 5 月测试,在 x86-64 + Linux + Python 3.10 环境中,需要安装的依赖总体积仅仅从 717 MB 增加到 738 MB。 PaddleOCR、PaddleX 和飞桨框架的版本存在如下对应关系: @@ -34,6 +34,7 @@ PaddleOCR、PaddleX 和飞桨框架的版本存在如下对应关系: | `3.2.x` | `>= 3.2.0, < 3.3.0` | `>= 3.0.0` | | `3.3.x` | `>= 3.3.0, < 3.4.0` | `>= 3.0.0` | | `3.4.x` | `>= 3.4.0, < 3.5.0` | `>= 3.0.0` | +| `3.5.x` | `>= 3.5.0, < 3.6.0` | `>= 3.0.0` | ## 2. 
PaddleOCR 产线与 PaddleX 产线注册名的对应关系 diff --git a/docs/version3.x/paddlepaddle_installation.en.md b/docs/version3.x/paddlepaddle_installation.en.md new file mode 100644 index 00000000000..fa9936d88f5 --- /dev/null +++ b/docs/version3.x/paddlepaddle_installation.en.md @@ -0,0 +1,97 @@ +--- +comments: true +--- + +# PaddlePaddle Framework Installation + +This document explains how to install PaddlePaddle. The following scenarios usually require the PaddlePaddle framework to be installed first: + +- using the PaddlePaddle framework as the inference engine for pipeline/model inference; +- performing development tasks such as model training and export. + +## 1. Install PaddlePaddle with Docker + +If you install via Docker, use the following commands with the official PaddlePaddle Docker images to create a container named `paddleocr` and mount the current working directory to `/paddle` inside the container. + +If your Docker version is >= 19.03, run: + +```bash +# For CPU users: +docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0 /bin/bash + +# For GPU users: +# GPU version, requires driver version >= 450.80.02 (Linux) or >= 452.39 (Windows) +docker run --gpus all --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash + +# GPU version, requires driver version >= 550.54.14 (Linux) or >= 550.54.14 (Windows) +docker run --gpus all --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash +``` + +If your Docker version is <= 19.03 but >= 17.06, run: + +
    Click to expand + +
    # For CPU users:
    +docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0 /bin/bash
    +
    +# For GPU users:
    +# CUDA 11.8 users
    +nvidia-docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
    +
    +# CUDA 12.6 users
    +nvidia-docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
    +
    +
    + +If your Docker version is <= 17.06, please upgrade Docker first. + +For more official PaddlePaddle Docker images, see the [PaddlePaddle website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/en/install/docker/linux-docker.html). + +## 2. Install PaddlePaddle with pip + +If you install via pip, use the following commands to install PaddlePaddle in the current environment: + +```bash +# CPU version +python -m pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ + +# GPU version, requires driver version >= 450.80.02 (Linux) or >= 452.39 (Windows) +python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ + +# GPU version, requires driver version >= 550.54.14 (Linux) or >= 550.54.14 (Windows) +python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +``` + +> ❗ Note: You do not need to pay attention to the physical machine's CUDA version. You only need to care about the GPU driver version. For more PaddlePaddle wheel versions, see the [PaddlePaddle website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/en/install/pip/linux-pip.html). + +After installation, use the following command to verify whether PaddlePaddle is installed successfully: + +```bash +python -c "import paddle; print(paddle.__version__)" +``` + +If the installation succeeds, it will output a version number like: + +```bash +3.2.0 +``` + +## 3. Install PaddlePaddle wheel packages for NVIDIA 50-series GPUs on Windows + +PaddlePaddle installed using the methods above does not properly support NVIDIA 50-series GPUs on Windows. Therefore, we provide specially adapted PaddlePaddle packages for this hardware environment. Please choose the corresponding wheel file according to your Python version. 
+ +```bash +# python 3.9 +python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp39-cp39-win_amd64.whl + +# python 3.10 +python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp310-cp310-win_amd64.whl + +# python 3.11 +python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp311-cp311-win_amd64.whl + +# python 3.12 +python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp312-cp312-win_amd64.whl +``` + +The currently released PaddlePaddle wheel packages for Windows 50-series GPUs still have known issues in text-recognition model training, and related support is still being adapted and improved. diff --git a/docs/version3.x/paddlepaddle_installation.md b/docs/version3.x/paddlepaddle_installation.md new file mode 100644 index 00000000000..31e3fb500f7 --- /dev/null +++ b/docs/version3.x/paddlepaddle_installation.md @@ -0,0 +1,97 @@ +--- +comments: true +--- + +# 飞桨框架安装 + +本文档说明如何安装 PaddlePaddle。以下场景通常需要先安装飞桨框架: + +- 在进行产线/模型推理时,使用飞桨框架作为推理引擎; +- 进行模型训练、导出等开发工作。 + +## 1. 
基于 Docker 安装飞桨 + +若您通过 Docker 安装,请参考下述命令,使用飞桨框架官方 Docker 镜像,创建一个名为 `paddleocr` 的容器,并将当前工作目录映射到容器内的 `/paddle` 目录。 + +若您使用的 Docker 版本 >= 19.03,请执行: + +```bash +# 对于 CPU 用户: +docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0 /bin/bash + +# 对于 GPU 用户: +# GPU 版本,需显卡驱动程序版本 ≥450.80.02(Linux)或 ≥452.39(Windows) +docker run --gpus all --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash + +# GPU 版本,需显卡驱动程序版本 ≥550.54.14(Linux)或 ≥550.54.14(Windows) +docker run --gpus all --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash +``` + +若您使用的 Docker 版本 <= 19.03 但 >= 17.06,请执行: + +
    点击展开 + +
    # 对于 CPU 用户:
    +docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0 /bin/bash
    +
    +# 对于 GPU 用户:
    +# CUDA11.8 用户
    +nvidia-docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
    +
    +# CUDA12.6 用户
    +nvidia-docker run --name paddleocr -v $PWD:/paddle --shm-size=8G --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
    +
    +
    + +若您使用的 Docker 版本 <= 17.06,请升级 Docker 版本。 + +更多飞桨官方 Docker 镜像请参考[飞桨官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html)。 + +## 2. 基于 pip 安装飞桨 + +若您通过 pip 安装,请参考下述命令,用 pip 在当前环境中安装 PaddlePaddle: + +```bash +# CPU 版本 +python -m pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ + +# GPU 版本,需显卡驱动程序版本 ≥450.80.02(Linux)或 ≥452.39(Windows) +python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ + +# GPU 版本,需显卡驱动程序版本 ≥550.54.14(Linux)或 ≥550.54.14(Windows) +python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +``` + +> ❗ :无需关注物理机上的 CUDA 版本,只需关注显卡驱动程序版本。更多飞桨 Wheel 版本请参考[飞桨官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)。 + +安装完成后,使用以下命令验证 PaddlePaddle 是否安装成功: + +```bash +python -c "import paddle; print(paddle.__version__)" +``` + +如果已安装成功,将输出如下版本号: + +```bash +3.2.0 +``` + +## 3. 
Windows 系统适配 NVIDIA 50 系显卡的 PaddlePaddle wheel 包安装 + +通过以上方式安装的 PaddlePaddle 在 Windows 操作系统下无法正常支持 NVIDIA 50 系显卡。因此,我们提供了专门适配该硬件环境的 PaddlePaddle 安装包。请根据您的 Python 版本选择对应的 wheel 文件进行安装。 + +```bash +# python 3.9 +python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp39-cp39-win_amd64.whl + +# python 3.10 +python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp310-cp310-win_amd64.whl + +# python 3.11 +python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp311-cp311-win_amd64.whl + +# python 3.12 +python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Windows-Gpu-Cuda12.9-Cudnn9.9-Trt10.5-Mkl-Avx-VS2019-SelfBuiltPypiUse/86d658f56ebf3a5a7b2b33ace48f22d10680d311/paddlepaddle_gpu-3.0.0.dev20250717-cp312-cp312-win_amd64.whl +``` + +当前发布的适用于 Windows 系统 50 系显卡的 PaddlePaddle wheel 包,其文本识别模型的训练存在已知问题,相关功能仍在持续适配和完善中。 diff --git a/docs/version3.x/pipeline_usage/OCR.en.md b/docs/version3.x/pipeline_usage/OCR.en.md index c5ff8d26e3c..0a3fa066793 100644 --- a/docs/version3.x/pipeline_usage/OCR.en.md +++ b/docs/version3.x/pipeline_usage/OCR.en.md @@ -23,6 +23,7 @@ The general OCR pipeline is used to solve text recognition tasks by extracting t In this pipeline, you can select models based on the benchmark test data provided below. > The inference time only includes the model inference time and does not include the time for pre- or post-processing. 
+> In the inference time columns labeled [Standard Mode / High-Performance Mode], the Standard Mode values correspond to the local `paddle_static` inference engine.
    Document Image Orientation Classification Module (Optional): @@ -656,7 +657,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">Inference Model/Inference Model/Inference Model/Command line supports more parameter settings. Click to expand for detailed instructions on command line parameters. @@ -984,33 +1039,38 @@ Supports specifying a specific card number: + + + + + + - + - + - - + - @@ -1018,20 +1078,20 @@ If MKL-DNN is unavailable or the model does not support it, acceleration will no - + - + - + @@ -1292,6 +1352,30 @@ If set to None, the default batch size will be 1. + + + + + + + + + + + + + + + + + + - @@ -1439,10 +1523,22 @@ Supports specifying a specific card number: + + + + + + + + + + + + - + @@ -1456,7 +1552,7 @@ For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6) - + @@ -1480,7 +1576,7 @@ If MKL-DNN is unavailable or the model does not support it, acceleration will no - + @@ -2449,6 +2545,39 @@ pipeline = PaddleOCR(text_detection_model_dir="./your_det_model_path") ``` +The example above uses the local `paddle_static` inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). 
+ +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured by following [Inference Engine and Configuration](../inference_engine.en.md), and then run the following code: + +```python +from paddleocr import PaddleOCR + +ocr = PaddleOCR( + use_doc_orientation_classify=False, # Disable document orientation classification + use_doc_unwarping=False, # Disable document unwarping + use_textline_orientation=False, # Disable textline orientation classification + engine="transformers", +) +# ocr = PaddleOCR(lang="en", engine="transformers") # Use the English model +# ocr = PaddleOCR(ocr_version="PP-OCRv4", engine="transformers") # Use another PP-OCR version +# ocr = PaddleOCR(device="gpu", engine="transformers") # Use GPU for inference +# ocr = PaddleOCR( +# text_detection_model_name="PP-OCRv5_server_det", +# text_recognition_model_name="PP-OCRv5_server_rec", +# use_doc_orientation_classify=False, +# use_doc_unwarping=False, +# use_textline_orientation=False, +# engine="transformers", +# ) # Switch to the PP-OCRv5_server models +result = ocr.predict("./general_ocr_002.png") +for res in result: + res.print() + res.save_to_img("output") + res.save_to_json("output") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + #### 4.2.2 Specify the local model path through the configuration file 1.Obtain the pipeline configuration file diff --git a/docs/version3.x/pipeline_usage/OCR.md b/docs/version3.x/pipeline_usage/OCR.md index 0a9bbb36306..15c009cfd45 100644 --- a/docs/version3.x/pipeline_usage/OCR.md +++ b/docs/version3.x/pipeline_usage/OCR.md @@ -23,6 +23,7 @@ OCR(光学字符识别,Optical Character Recognition)是一种将图像中 在本产线中,您可以根据下方的基准测试数据选择使用的模型。 > 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 在带有 [常规模式 / 高性能模式] 标记的推理耗时列中,`常规模式` 对应本地推理引擎 `paddle_static`。
    文档图像方向分类模块(可选): @@ -662,7 +663,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">推理模型/推理模型/推理模型/
    命令行支持更多参数设置,点击展开以查看命令行参数的详细说明 @@ -997,30 +1024,38 @@ paddleocr ocr -i ./general_ocr_002.png --ocr_version PP-OCRv4
    + + + + + + - + - + - - + - @@ -1028,15 +1063,16 @@ paddleocr ocr -i ./general_ocr_002.png --ocr_version PP-OCRv4 +含义:MKL-DNN 缓存容量。 + - + - + @@ -1044,6 +1080,7 @@ paddleocr ocr -i ./general_ocr_002.png --ocr_version PP-OCRv4 +
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    enable_hpiMeaning:Whether to enable high-performance inference.Meaning: Whether to enable high-performance inference. boolFalseNone
    use_tensorrtMeaning:Whether to use the Paddle Inference TensorRT subgraph engine.
    +
    Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
    Description: -If the model does not support acceleration through TensorRT, setting this flag will not enable acceleration.
    -For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6), and it is recommended to install TensorRT 8.6.1.6.
    - +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
    +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
    bool False
    precisionMeaning:Computational precision, such as fp32, fp16.Meaning: Computation precision, such as fp32 or fp16. str fp32
    enable_mkldnnMeaning:Whether to enable MKL-DNN acceleration for inference.
    +
    Meaning: Whether to enable MKL-DNN accelerated inference.
    Description: -If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set.
    bool True
    mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. +Meaning: MKL-DNN cache capacity. int 10
    cpu_threadsMeaning:Number of threads used for inference on CPU.Meaning: Number of threads used for inference on CPU. int810
    paddlex_configMeaning:Path to the PaddleX pipeline configuration file.Meaning: Path to the PaddleX pipeline configuration file. str
    None
    text_recognition_model_nameMeaning:Name of the text recognition model.
    +Description: +If set to None, the pipeline's default model will be used.
    str|NoneNone
    text_recognition_model_dirMeaning:Directory path of the text recognition model.
    +Description: +If set to None, the official model will be downloaded.
    str|NoneNone
    text_recognition_batch_sizeMeaning:Batch size for the text recognition model.
    +Description: +If set to None, the default batch size will be 1.
    int|NoneNone
    use_doc_orientation_classify Meaning:Whether to load and use the document orientation classification module.
    Description: @@ -1309,7 +1393,7 @@ If set to None, the pipeline's initialized value for this parameter
    use_textline_orientationMeaning:Whether to load and use the text line orientation module.
    +
    Meaning:Whether to load and use the text line orientation module.
    Description: If set to None, the pipeline's initialized value for this parameter (defaults to True) will be used.
    bool|NoneNone
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    engine_configMeaning: Inference-engine configuration.
    Description: Recommended for use together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration.
    dict|NoneNone
    enable_hpi Meaning:Whether to enable high-performance inference. boolFalseNone
    use_tensorrt
    precisionMeaning:Computational precision, such as fp32, fp16.Meaning:Computational precision, such as "fp32", "fp16". str "fp32"
    cpu_threads Meaning:Number of threads used for CPU inference. int810
    paddlex_config
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明
    str|NoneNone
    enable_hpi含义:是否启用高性能推理。 -
    说明:如果不设置,将使用默认值False
    含义:是否启用高性能推理。 boolFalseNone
    use_tensorrt含义:是否启用 Paddle Inference 的 TensorRT 子图引擎。
    说明:如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
    +
    含义:是否启用 Paddle Inference 的 TensorRT 子图引擎。
    +说明: +如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
    对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
    -
    bool False
    precision含义:计算精度,如 fp32、fp16。
    说明:如果不设置,将使用默认值fp32
    含义:计算精度,如 fp32fp16 str fp32
    enable_mkldnn含义:是否启用 MKL-DNN 加速推理。
    说明:如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
    +
    含义:是否启用 MKL-DNN 加速推理。
    +说明: +如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
    bool True
    mkldnn_cache_capacity -含义:MKL-DNN 缓存容量。
    说明:如果不设置,将使用默认值10
    int 10
    cpu_threads含义:在 CPU 上进行推理时使用的线程数。
    说明:如果不设置,将使用默认值8
    含义:在 CPU 上进行推理时使用的线程数。 int810
    paddlex_configstr
    以下参数在2.x版本已经废弃,为方便之前版本使用者故列出 @@ -1188,6 +1225,39 @@ for res in result: res.save_to_img("output") res.save_to_json("output") ``` + +上述代码默认使用本地推理引擎 `paddle_static`。如需运行,请先参考[飞桨框架安装说明](../paddlepaddle_installation.md)安装 PaddlePaddle。 + +如果选择 `transformers` 作为推理引擎,请先参考[推理引擎文档](../inference_engine.md)完成 Transformers 环境配置,然后执行如下代码: + +```python +from paddleocr import PaddleOCR + +ocr = PaddleOCR( + use_doc_orientation_classify=False, # 通过 use_doc_orientation_classify 参数指定不使用文档方向分类模型 + use_doc_unwarping=False, # 通过 use_doc_unwarping 参数指定不使用文本图像矫正模型 + use_textline_orientation=False, # 通过 use_textline_orientation 参数指定不使用文本行方向分类模型 + engine="transformers", +) +# ocr = PaddleOCR(lang="en", engine="transformers") # 通过 lang 参数来使用英文模型 +# ocr = PaddleOCR(ocr_version="PP-OCRv4", engine="transformers") # 通过 ocr_version 参数来使用 PP-OCR 其他版本 +# ocr = PaddleOCR(device="gpu", engine="transformers") # 通过 device 参数使得在模型推理时使用 GPU +# ocr = PaddleOCR( +# text_detection_model_name="PP-OCRv5_server_det", +# text_recognition_model_name="PP-OCRv5_server_rec", +# use_doc_orientation_classify=False, +# use_doc_unwarping=False, +# use_textline_orientation=False, +# engine="transformers", +# ) # 更换 PP-OCRv5_server 模型 +result = ocr.predict("./general_ocr_002.png") +for res in result: + res.print() + res.save_to_img("output") + res.save_to_json("output") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 在上述 Python 脚本中,执行了如下几个步骤: @@ -1356,7 +1426,7 @@ for res in result:
    text_det_unclip_ratio -含义:文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。 +含义:文本检测扩张系数,使用该方法对文字区域进行扩张,该值越大,扩张的面积越大。
    说明:
    • float:大于0的任意浮点数; @@ -1433,10 +1503,22 @@ for res in result: None +engine +含义:推理引擎。
      说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见
      推理引擎与配置说明。 +str|None +None + + +engine_config +含义:推理引擎配置。
      说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。 +dict|None +None + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt @@ -1450,7 +1532,7 @@ for res in result: precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 "fp32""fp16"str "fp32" @@ -1473,7 +1555,7 @@ for res in result: cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config diff --git a/docs/version3.x/pipeline_usage/PP-ChatOCRv4.en.md b/docs/version3.x/pipeline_usage/PP-ChatOCRv4.en.md index 650e8a58433..d8e90b4c76e 100644 --- a/docs/version3.x/pipeline_usage/PP-ChatOCRv4.en.md +++ b/docs/version3.x/pipeline_usage/PP-ChatOCRv4.en.md @@ -26,6 +26,7 @@ The PP-ChatOCRv4 pipeline includes the following 9 modules. Each module can be t In this pipeline, you can choose the model to use based on the benchmark data below. > The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> In the inference time columns labeled [Normal Mode / High-Performance Mode], the `Normal Mode` values correspond to local Paddle inference engines. Each module selects the appropriate local Paddle inference engine according to the default model name: models that support only dynamic graph use `paddle_dynamic`, while models that support both static and dynamic graph prefer `paddle_static`.
      Document Image Orientation Classification Module (Optional): @@ -927,7 +928,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">Inference Model/Inference Model/Inference Engine and Configuration. +str|None +None + + enable_hpi -Meaning:Whether to enable the high-performance inference plugin. +Meaning: Whether to enable high-performance inference. bool -False +None use_tensorrt -Meaning:Whether to use the Paddle Inference TensorRT subgraph engine.
      +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
      Description: -If the model does not support acceleration through TensorRT, setting this flag will not enable acceleration.
      -For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6), and it is recommended to install TensorRT 8.6.1.6.
      - +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
      +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
      bool False precision -Meaning:Compute precision, such as FP32 or FP16. +Meaning: Computation precision, such as fp32 or fp16. str fp32 enable_mkldnn -Meaning:Whether to enable MKL-DNN acceleration for inference.
      +Meaning: Whether to enable MKL-DNN accelerated inference.
      Description: -If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. bool True @@ -1434,23 +1450,24 @@ If MKL-DNN is unavailable or the model does not support it, acceleration will no mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. +Meaning: MKL-DNN cache capacity. int 10 cpu_threads -Meaning:The number of threads to use when performing inference on the CPU. +Meaning: Number of threads used for inference on CPU. int -8 +10 paddlex_config -Meaning:Path to PaddleX pipeline configuration file. +Meaning: Path to the PaddleX pipeline configuration file. str + @@ -1516,12 +1533,82 @@ vector_info = pipeline.build_vector( ) mllm_predict_res = pipeline.mllm_pred( input="vehicle_certificate-1.png", - key_list=["Cab Seating Capacity"], # Translated: 驾驶室准乘人数 + key_list=["驾驶室准乘人数"], + mllm_chat_bot_config=mllm_chat_bot_config, +) +mllm_predict_info = mllm_predict_res["mllm_res"] +chat_result = pipeline.chat( + key_list=["驾驶室准乘人数"], + visual_info=visual_info_list, + vector_info=vector_info, + mllm_predict_info=mllm_predict_info, + chat_bot_config=chat_bot_config, + retriever_config=retriever_config, +) +print(chat_result) + +``` + +The example above uses local Paddle inference engines by default. By default, each module selects the appropriate local Paddle inference engine according to the default model name: models that support only dynamic graph use `paddle_dynamic`, while models that support both static and dynamic graph prefer `paddle_static`. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). 
+ +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured by following [Inference Engine and Configuration](../inference_engine.en.md), and then run the following code: + +```python +from paddleocr import PPChatOCRv4Doc + +chat_bot_config = { + "module_name": "chat_bot", + "model_name": "ernie-3.5-8k", + "base_url": "https://qianfan.baidubce.com/v2", + "api_type": "openai", + "api_key": "api_key", # your api_key +} + +retriever_config = { + "module_name": "retriever", + "model_name": "embedding-v1", + "base_url": "https://qianfan.baidubce.com/v2", + "api_type": "qianfan", + "api_key": "api_key", # your api_key +} + +mllm_chat_bot_config = { + "module_name": "chat_bot", + "model_name": "PP-DocBee2", + "base_url": "http://127.0.0.1:8080/", # your local mllm service url + "api_type": "openai", + "api_key": "api_key", # your api_key +} + +pipeline = PPChatOCRv4Doc( + engine="transformers", +) + +visual_predict_res = pipeline.visual_predict( + input="vehicle_certificate-1.png", + use_doc_orientation_classify=False, + use_doc_unwarping=False, + use_common_ocr=True, + use_seal_recognition=True, + use_table_recognition=True, +) + +visual_info_list = [] +for res in visual_predict_res: + visual_info_list.append(res["visual_info"]) + layout_parsing_result = res["layout_parsing_result"] + +vector_info = pipeline.build_vector( + visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config +) +mllm_predict_res = pipeline.mllm_pred( + input="vehicle_certificate-1.png", + key_list=["驾驶室准乘人数"], mllm_chat_bot_config=mllm_chat_bot_config, ) mllm_predict_info = mllm_predict_res["mllm_res"] chat_result = pipeline.chat( - key_list=["Cab Seating Capacity"], # Translated: 驾驶室准乘人数 + key_list=["驾驶室准乘人数"], visual_info=visual_info_list, vector_info=vector_info, mllm_predict_info=mllm_predict_info, @@ -1988,34 +2075,46 @@ Supports specifying a specific card number: str|None None + +engine +Meaning: Inference engine.
      Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
      Description: Recommended for use together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + + +enable_hpi -Meaning:Whether to enable high-performance inference. +Meaning: Whether to enable high-performance inference. bool -False +None use_tensorrt -Meaning:Whether to use the Paddle Inference TensorRT subgraph engine.
      -Description: -If the model does not support acceleration through TensorRT, setting this flag will not enable acceleration.
      -For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6), and it is recommended to install TensorRT 8.6.1.6.
      - +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
      +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
      +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
      bool False precision -Meaning:Computation precision, e.g., fp32, fp16. +Meaning: Computation precision, such as "fp32" or "fp16". str "fp32" enable_mkldnn -Meaning:Whether to enable MKL-DNN acceleration for inference.
      -Description: -If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +Meaning: Whether to enable MKL-DNN accelerated inference.
      +Description: +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. bool True @@ -2023,23 +2122,24 @@ If MKL-DNN is unavailable or the model does not support it, acceleration will no mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. +Meaning: MKL-DNN cache capacity. int 10 cpu_threads -Meaning:Number of threads used when performing inference on CPU. +Meaning: Number of threads used for inference on CPU. int -8 +10 paddlex_config -Meaning:PaddleX pipeline configuration file path. +Meaning: Path to the PaddleX pipeline configuration file. str|None None +
      diff --git a/docs/version3.x/pipeline_usage/PP-ChatOCRv4.md b/docs/version3.x/pipeline_usage/PP-ChatOCRv4.md index 8703eaa3101..41fe191ba02 100644 --- a/docs/version3.x/pipeline_usage/PP-ChatOCRv4.md +++ b/docs/version3.x/pipeline_usage/PP-ChatOCRv4.md @@ -27,6 +27,7 @@ PP-ChatOCRv4 是飞桨特色的文档和图像智能分析解决方案,结合 在本产线中,您可以根据下方的基准测试数据选择使用的模型。 > 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 在带有 [常规模式 / 高性能模式] 标记的推理耗时列中,`常规模式` 对应本地飞桨推理引擎。各模块会根据默认模型名称选择合适的本地飞桨推理引擎:仅支持动态图的模型使用 `paddle_dynamic`;同时支持静态图和动态图的模型优先使用 `paddle_static`。
      文档图像方向分类模块(可选): @@ -818,7 +819,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">推理模型/推理模型/推理引擎与配置说明。 +str|None +None + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt -含义:是否启用 Paddle Inference 的 TensorRT 子图引擎。 -说明:如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
      +含义:是否启用 Paddle Inference 的 TensorRT 子图引擎。
      +说明: +如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
      对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
      - bool False precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 fp32fp16str fp32 enable_mkldnn 含义:是否启用 MKL-DNN 加速推理。
      -说明:如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 +说明: +如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool True @@ -1298,7 +1317,7 @@ paddleocr pp_chatocrv4_doc -i vehicle_certificate-1.png -k 驾驶室准乘人数 cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config @@ -1306,6 +1325,7 @@ paddleocr pp_chatocrv4_doc -i vehicle_certificate-1.png -k 驾驶室准乘人数 str +
      @@ -1385,6 +1405,76 @@ print(chat_result) ``` +上述代码默认使用本地飞桨推理引擎。默认情况下,各模块会根据默认模型名称选择合适的本地飞桨推理引擎:仅支持动态图的模型使用 `paddle_dynamic`;同时支持静态图和动态图的模型优先使用 `paddle_static`。如需运行,请先参考[飞桨框架安装说明](../paddlepaddle_installation.md)安装 PaddlePaddle。 + +如果选择 `transformers` 作为推理引擎,请先参考[推理引擎文档](../inference_engine.md)完成 Transformers 环境配置,然后执行如下代码: + +```python +from paddleocr import PPChatOCRv4Doc + +chat_bot_config = { + "module_name": "chat_bot", + "model_name": "ernie-3.5-8k", + "base_url": "https://qianfan.baidubce.com/v2", + "api_type": "openai", + "api_key": "api_key", # your api_key +} + +retriever_config = { + "module_name": "retriever", + "model_name": "embedding-v1", + "base_url": "https://qianfan.baidubce.com/v2", + "api_type": "qianfan", + "api_key": "api_key", # your api_key +} + +mllm_chat_bot_config = { + "module_name": "chat_bot", + "model_name": "PP-DocBee2", + "base_url": "http://127.0.0.1:8080/", # your local mllm service url + "api_type": "openai", + "api_key": "api_key", # your api_key +} + +pipeline = PPChatOCRv4Doc( + engine="transformers", +) + +visual_predict_res = pipeline.visual_predict( + input="vehicle_certificate-1.png", + use_doc_orientation_classify=False, + use_doc_unwarping=False, + use_common_ocr=True, + use_seal_recognition=True, + use_table_recognition=True, +) + +visual_info_list = [] +for res in visual_predict_res: + visual_info_list.append(res["visual_info"]) + layout_parsing_result = res["layout_parsing_result"] + +vector_info = pipeline.build_vector( + visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config +) +mllm_predict_res = pipeline.mllm_pred( + input="vehicle_certificate-1.png", + key_list=["驾驶室准乘人数"], + mllm_chat_bot_config=mllm_chat_bot_config, +) +mllm_predict_info = mllm_predict_res["mllm_res"] +chat_result = pipeline.chat( + key_list=["驾驶室准乘人数"], + visual_info=visual_info_list, + vector_info=vector_info, + mllm_predict_info=mllm_predict_info, + chat_bot_config=chat_bot_config, + 
retriever_config=retriever_config, +) +print(chat_result) + +``` + 运行后,输出结果如下: ``` @@ -1819,11 +1909,24 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: str|None None + +engine +含义:推理引擎。
      说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 +str|None +None + + +engine_config +含义:推理引擎配置。
      说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。 +dict|None +None + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt @@ -1831,14 +1934,13 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: 说明: 如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
      对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
      - bool False precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 "fp32""fp16"str "fp32" @@ -1863,7 +1965,7 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config @@ -1977,6 +2079,7 @@ PP-ChatOCRv4 预测的流程、API说明、产出说明如下: str|None None + text_det_thresh 含义:参数含义与实例化参数基本相同。
      diff --git a/docs/version3.x/pipeline_usage/PP-DocTranslation.en.md b/docs/version3.x/pipeline_usage/PP-DocTranslation.en.md index c1ea73b858f..5de6704fdb6 100644 --- a/docs/version3.x/pipeline_usage/PP-DocTranslation.en.md +++ b/docs/version3.x/pipeline_usage/PP-DocTranslation.en.md @@ -13,6 +13,7 @@ PP-DocTranslation is a document intelligent translation solution provided by Pad In this pipeline, you can select the model to use based on the benchmark data below. > The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> In the inference time columns labeled [Regular Mode / High-Performance Mode], the `Regular Mode` values correspond to local Paddle inference engines. Each module selects the appropriate local Paddle inference engine according to the default model name: models that support only dynamic graph use `paddle_dynamic`, while models that support both static and dynamic graph prefer `paddle_static`.
      👉Model List Details

      Document Image Orientation Classification Module:

      @@ -644,7 +645,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">Inference Model/Inference Model/Inference Engine and Configuration. +str|None +None + + enable_hpi -Meaning:Whether to enable high-performance inference. +Meaning: Whether to enable high-performance inference. bool -False +None use_tensorrt -Meaning:Whether to enable the TensorRT subgraph engine of Paddle Inference.
      -Description: -If the model does not support acceleration by TensorRT, enabling this flag will not enable acceleration.
      -For PaddlePaddle with CUDA 11.8, compatible TensorRT version is 8.x (x≥6), recommended TensorRT version is 8.6.1.6.
      +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
      +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
      +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
      bool False precision -Meaning:Computation precision, e.g. fp32, fp16. +Meaning: Computation precision, such as fp32 or fp16. str fp32 enable_mkldnn -Meaning:Whether to enable MKL-DNN accelerated inference. -Description: -If MKL-DNN is unavailable or the model does not support acceleration via MKL-DNN, enabling this flag will not enable acceleration. +Meaning: Whether to enable MKL-DNN accelerated inference.
      +Description: +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. + bool True mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. + +Meaning: MKL-DNN cache capacity. + int 10 cpu_threads -Meaning:Number of threads used for inference on CPU. +Meaning: Number of threads used for inference on CPU. int -8 +10 paddlex_config -Meaning:Path to PaddleX pipeline configuration file. +Meaning: Path to the PaddleX pipeline configuration file. str @@ -1353,7 +1373,72 @@ else: use_doc_unwarping=False, use_common_ocr=True, use_seal_recognition=True, -use_table_recognition=True, + use_table_recognition=True, + ) + + ori_md_info_list = [] + for res in visual_predict_res: + layout_parsing_result = res["layout_parsing_result"] + ori_md_info_list.append(layout_parsing_result.markdown) + layout_parsing_result.save_to_img(output_path) + layout_parsing_result.save_to_markdown(output_path) + + # Concatenate the markdown information of multi-page documents into a single markdown file, and save the merged original markdown text + if input_path.lower().endswith(".pdf"): + ori_md_info = pipeline.concatenate_markdown_pages(ori_md_info_list) + ori_md_info.save_to_markdown(output_path) + +# Perform document translation (target language: English) +tgt_md_info_list = pipeline.translate( + ori_md_info_list=ori_md_info_list, + target_language="en", + chunk_size=5000, + chat_bot_config=chat_bot_config, +) +# Save the translation results +for tgt_md_info in tgt_md_info_list: + tgt_md_info.save_to_markdown(output_path) +``` + +The example above uses local Paddle inference engines by default. By default, each module selects the appropriate local Paddle inference engine according to the default model name: models that support only dynamic graph use `paddle_dynamic`, while models that support both static and dynamic graph prefer `paddle_static`. 
To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured by following [Inference Engine and Configuration](../inference_engine.en.md), and then run the following code: + +```python +from paddleocr import PPDocTranslation + +# Create a translation pipeline +pipeline = PPDocTranslation( + engine="transformers", +) + +# Document path +input_path = "document_sample.pdf" + +# Output directory +output_path = "./output" + +# Large model configuration +chat_bot_config = { + "module_name": "chat_bot", + "model_name": "ernie-3.5-8k", + "base_url": "https://qianfan.baidubce.com/v2", + "api_type": "openai", + "api_key": "api_key", # your api_key +} + +if input_path.lower().endswith(".md"): + # Read markdown documents, supporting passing in directories and url links with the .md suffix + ori_md_info_list = pipeline.load_from_markdown(input_path) +else: + # Use PP-StructureV3 to perform layout parsing on PDF/image documents to obtain markdown information + visual_predict_res = pipeline.visual_predict( + input_path, + use_doc_orientation_classify=False, + use_doc_unwarping=False, + use_common_ocr=True, + use_seal_recognition=True, + use_table_recognition=True, ) ori_md_info_list = [] @@ -1999,55 +2084,71 @@ Supports specifying a specific card number: str|None None + +engine +Meaning: Inference engine.
      Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
      Description: Recommended for use together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + + +enable_hpi -Meaning:Whether to enable high-performance inference. +Meaning: Whether to enable high-performance inference. bool -False +None use_tensorrt -Meaning:Whether to enable Paddle Inference’s TensorRT subgraph engine.
      -Description: -If the model does not support acceleration via TensorRT, enabling this flag will have no effect.
      -For Paddle with CUDA 11.8, the compatible TensorRT version is 8.x (x≥6), recommended installation is TensorRT 8.6.1.6.
      +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
      +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
      +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
      bool False precision -Meaning:Computation precision, such as fp32, fp16. +Meaning: Computation precision, such as "fp32" or "fp16". str "fp32" enable_mkldnn -Meaning:Whether to enable MKL-DNN accelerated inference.
      -Description: -If MKL-DNN is unavailable or the model does not support acceleration via MKL-DNN, enabling this flag will have no effect. +Meaning: Whether to enable MKL-DNN accelerated inference.
      +Description: +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. bool True mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. + +Meaning: MKL-DNN cache capacity. + int 10 cpu_threads -Meaning:Number of threads used during inference on CPU. +Meaning: Number of threads used for inference on CPU. int -8 +10 paddlex_config -Meaning:Path to the PaddleX pipeline configuration file. +Meaning: Path to the PaddleX pipeline configuration file. str|None None + @@ -2893,6 +2994,12 @@ Below are the API references for basic serving and examples of multi-language se No +outputFormats +array | null +Optional. Additional export formats; currently only "docx" is supported. +No + + visualize boolean | null Whether to return visualization result images and intermediate images during processing. @@ -2926,12 +3033,12 @@ By default, images will not be returned; the visualize parameter in layoutParsingResults array -Layout parsing results. The array length is 1 (for image input) or equals the actual number of processed pages (for PDF input). For PDF input, each element corresponds to the result of each processed page in order. +Layout parsing results. The array length is 1 (for image input) or the number of document pages actually processed (for PDF input). For PDF input, each element in the array is the result of each processed page, in order. dataInfo object -Input data information. +Metadata about the input data. @@ -2965,6 +3072,11 @@ By default, images will not be returned; the visualize parameter in string | null Input image. JPEG format, Base64 encoded. + +exports +object | null +Optional additional exports such as docx, present only when outputFormats is requested. The content field is Base64-encoded file bytes. +

      markdown is an object with the following properties:

      @@ -2985,17 +3097,17 @@ By default, images will not be returned; the visualize parameter in images object -Key-value pairs of Markdown image relative paths and Base64 encoded images. +Mapping from relative Markdown image paths to Base64-encoded image data. isStart boolean -Whether the first element on the current page is the start of a paragraph. +Whether the first element on the current page starts a paragraph. isEnd boolean -Whether the last element on the current page is the end of a paragraph. +Whether the last element on the current page ends a paragraph. @@ -3122,7 +3234,7 @@ By default, images will not be returned; the visualize parameter in markdown object -Markdown result. Object definition is consistent with the markdown returned by the analyzeImages operation. +Markdown result; same shape as markdown returned by analyzeImages. diff --git a/docs/version3.x/pipeline_usage/PP-DocTranslation.md b/docs/version3.x/pipeline_usage/PP-DocTranslation.md index a2c48d96471..60934e118e4 100644 --- a/docs/version3.x/pipeline_usage/PP-DocTranslation.md +++ b/docs/version3.x/pipeline_usage/PP-DocTranslation.md @@ -13,6 +13,7 @@ PP-DocTranslation 是飞桨提供的文档智能翻译解决方案,融合了 在本产线中,您可以根据下方的基准测试数据选择使用的模型。 > 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 在带有 [常规模式 / 高性能模式] 标记的推理耗时列中,`常规模式` 对应本地飞桨推理引擎。各模块会根据默认模型名称选择合适的本地飞桨推理引擎:仅支持动态图的模型使用 `paddle_dynamic`;同时支持静态图和动态图的模型优先使用 `paddle_static`。
      👉模型列表详情

      文档图像方向分类模块:

      @@ -641,7 +642,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">推理模型/推理模型/推理模型/推理引擎与配置说明。 +str|None +None + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt @@ -1301,7 +1318,7 @@ paddleocr pp_doctranslation -i vehicle_certificate-1.png --target_language en -- precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 fp32、fp16。 str fp32 @@ -1326,7 +1343,7 @@ paddleocr pp_doctranslation -i vehicle_certificate-1.png --target_language en -- cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config @@ -1334,6 +1351,7 @@ paddleocr pp_doctranslation -i vehicle_certificate-1.png --target_language en -- str +
      @@ -1404,6 +1422,71 @@ for tgt_md_info in tgt_md_info_list: tgt_md_info.save_to_markdown(output_path) ``` +上述代码默认使用本地飞桨推理引擎。默认情况下,各模块会根据默认模型名称选择合适的本地飞桨推理引擎:仅支持动态图的模型使用 `paddle_dynamic`;同时支持静态图和动态图的模型优先使用 `paddle_static`。如需运行,请先参考[飞桨框架安装说明](../paddlepaddle_installation.md)安装 PaddlePaddle。 + +如果选择 `transformers` 作为推理引擎,请先参考[推理引擎文档](../inference_engine.md)完成 Transformers 环境配置,然后执行如下代码: + +```python +from paddleocr import PPDocTranslation + +# 创建翻译产线 +pipeline = PPDocTranslation( + engine="transformers", +) + +# 文档路径 +input_path = "document_sample.pdf" + +# 输出目录 +output_path = "./output" + +# 大模型配置 +chat_bot_config = { + "module_name": "chat_bot", + "model_name": "ernie-3.5-8k", + "base_url": "https://qianfan.baidubce.com/v2", + "api_type": "openai", + "api_key": "api_key", # your api_key +} + +if input_path.lower().endswith(".md"): + # 读取markdown文档,支持传入目录和以 .md 为后缀的 url 链接 + ori_md_info_list = pipeline.load_from_markdown(input_path) +else: + # 使用 PP-StructureV3 对 PDF/图片 文档进行版面解析,获取markdown信息 + visual_predict_res = pipeline.visual_predict( + input_path, + use_doc_orientation_classify=False, + use_doc_unwarping=False, + use_common_ocr=True, + use_seal_recognition=True, + use_table_recognition=True, + ) + + ori_md_info_list = [] + for res in visual_predict_res: + layout_parsing_result = res["layout_parsing_result"] + ori_md_info_list.append(layout_parsing_result.markdown) + layout_parsing_result.save_to_img(output_path) + layout_parsing_result.save_to_markdown(output_path) + + # 将多页文档的 markdown 信息拼接成一个 markdown 文件,可将合并后的 markdown 原文保存 + if input_path.lower().endswith(".pdf"): + ori_md_info = pipeline.concatenate_markdown_pages(ori_md_info_list) + ori_md_info.save_to_markdown(output_path) + +# 执行文档翻译(目标语言:英文) +tgt_md_info_list = pipeline.translate( + ori_md_info_list=ori_md_info_list, + target_language="en", + chunk_size=5000, + chat_bot_config=chat_bot_config, +) +# 保存翻译结果 +for tgt_md_info in tgt_md_info_list: + tgt_md_info.save_to_markdown(output_path) +``` + 
执行上述代码后,您将得到翻译原文的文档解析结果、翻译原文的 Markdown 文件和翻译后文档的 Markdown 文件,保存在 output 目录中。 PP-DocTranslation 预测的流程、API 说明、产出说明如下: @@ -2018,11 +2101,24 @@ PP-DocTranslation 预测的流程、API 说明、产出说明如下: str|None None + +engine +含义:推理引擎。
      说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic 和 transformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 +str|None +None + + +engine_config +含义:推理引擎配置。
      说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。 +dict|None +None + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt @@ -2036,7 +2132,7 @@ PP-DocTranslation 预测的流程、API 说明、产出说明如下: precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 "fp32""fp16"str "fp32" @@ -2061,7 +2157,7 @@ PP-DocTranslation 预测的流程、API 说明、产出说明如下: cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config @@ -2069,6 +2165,7 @@ PP-DocTranslation 预测的流程、API 说明、产出说明如下: str|None None + @@ -2783,7 +2880,7 @@ PP-DocTranslation 预测的流程、API 说明、产出说明如下: layoutThreshold -number | object | null +number | object | null 请参阅产线对象中 visual_predict 方法的 layout_threshold 参数相关说明。 否 @@ -2881,19 +2978,19 @@ PP-DocTranslation 预测的流程、API 说明、产出说明如下: useWiredTableCellsTransToHtml boolean 请参阅产线对象中 visual_predict 方法的 use_wired_table_cells_trans_to_html 参数相关说明。 -No +否 useWirelessTableCellsTransToHtml boolean 请参阅产线对象中 visual_predict 方法的 use_wireless_table_cells_trans_to_html 参数相关说明。 -No +否 useTableOrientationClassify boolean 请参阅产线对象中 visual_predict 方法的 use_table_orientation_classify 参数相关说明。 -No +否 useOcrResultsWithTableCells @@ -2914,6 +3011,12 @@ PP-DocTranslation 预测的流程、API 说明、产出说明如下: 否 +outputFormats +array | null +可选。附加导出格式列表;当前仅支持 "docx"。 +否 + + visualize boolean | null 是否返回可视化结果图以及处理过程中的中间图像等。 @@ -2987,6 +3090,11 @@ PP-DocTranslation 预测的流程、API 说明、产出说明如下: string | null 输入图像。图像为JPEG格式,使用Base64编码。 + +exports +object | null +可选的 docx 等附加导出,仅当请求 outputFormats 时出现。其中 content 为文件内容的Base64编码。 +

      markdown为一个object,具有如下属性:

      diff --git a/docs/version3.x/pipeline_usage/PP-StructureV3.en.md b/docs/version3.x/pipeline_usage/PP-StructureV3.en.md index 7f46b968215..e699dadaa8a 100644 --- a/docs/version3.x/pipeline_usage/PP-StructureV3.en.md +++ b/docs/version3.x/pipeline_usage/PP-StructureV3.en.md @@ -21,6 +21,7 @@ Layout analysis is a technique used to extract structured information from docum In this pipeline, you can choose the model to use based on the benchmark data below. > The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> In the inference time columns labeled [Standard Mode / High-Performance Mode], [Normal Mode / High-Performance Mode], or [Regular Mode / High-Performance Mode], the Standard Mode, Normal Mode, and Regular Mode values correspond to local Paddle inference engines. Each module selects the appropriate local Paddle inference engine according to the default model name: models that support only dynamic graph use `paddle_dynamic`, while models that support both static and dynamic graph prefer `paddle_static`.
      Document Image Orientation Classification Module : @@ -922,7 +923,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">Inference Model/Inference Model/Inference Engine and Configuration. +str|None +None + + enable_hpi -Meaning:Whether to enable high performance inference. +Meaning: Whether to enable high-performance inference. bool -False +None use_tensorrt -Meaning:Whether to use the Paddle Inference TensorRT subgraph engine.
      -Description: -If the model does not support acceleration through TensorRT, setting this flag will not enable acceleration.
      -For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6), and it is recommended to install TensorRT 8.6.1.6.
      - +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
      +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
      +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
      bool False precision -Meaning:Computation precision, e.g., fp32, fp16. +Meaning: Computation precision, such as fp32 or fp16. str fp32 enable_mkldnn -Meaning:Whether to enable MKL-DNN acceleration for inference.
      -Description: -If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +Meaning: Whether to enable MKL-DNN accelerated inference.
      +Description: +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. + bool True mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. +Meaning: MKL-DNN cache capacity. int 10 cpu_threads -Meaning:Number of threads to use when inferring on CPU. +Meaning: Number of threads used for inference on CPU. int -8 +10 paddlex_config -Meaning:Path to the PaddleX pipeline configuration file. +Meaning: Path to the PaddleX pipeline configuration file. str @@ -1673,6 +1691,31 @@ for res in output: res.save_to_word(save_path="output") ## Save the current image's result in Word format ``` +The example above uses local Paddle inference engines by default. By default, each module selects the appropriate local Paddle inference engine according to the default model name: models that support only dynamic graph use `paddle_dynamic`, while models that support both static and dynamic graph prefer `paddle_static`. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured by following [Inference Engine and Configuration](../inference_engine.en.md), and then run the following code: + +```python +from paddleocr import PPStructureV3 + +# Some models are still being supported. For inference, please disable formula recognition and replace the wireless table structure recognition model using the following code: +pipeline = PPStructureV3( + engine="transformers", + use_formula_recognition=False, + wireless_table_structure_recognition_model_name="SLANeXt_wireless", +) +# pipeline = PPStructureV3(lang="en") # Set the lang parameter to use the English text recognition model. For other supported languages, see Section 5: Appendix. By default, both Chinese and English text recognition models are enabled. 
+# pipeline = PPStructureV3(use_doc_orientation_classify=True) # Use use_doc_orientation_classify to enable/disable document orientation classification model +# pipeline = PPStructureV3(use_doc_unwarping=True) # Use use_doc_unwarping to enable/disable document unwarping module +# pipeline = PPStructureV3(use_textline_orientation=True) # Use use_textline_orientation to enable/disable textline orientation classification model +# pipeline = PPStructureV3(device="gpu") # Use device to specify GPU for model inference +output = pipeline.predict("./pp_structure_v3_demo.png") +for res in output: + res.print() ## Print the structured prediction output + res.save_to_json(save_path="output") ## Save the current image's structured result in JSON format + res.save_to_markdown(save_path="output") ## Save the current image's result in Markdown format +``` + For PDF files, each page will be processed individually and generate a separate Markdown file. If you want to convert the entire PDF to a single Markdown file, use the following method: ```python @@ -2334,57 +2377,71 @@ Supports specifying device ID: str|None None + +engine +Meaning: Inference engine.
      Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
      Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + + enable_hpi -Meaning:Whether to enable high-performance inference. +Meaning: Whether to enable high-performance inference. bool -False +None use_tensorrt -Meaning:Whether to use the Paddle Inference TensorRT subgraph engine.
      -Description: -If the model does not support acceleration through TensorRT, setting this flag will not enable acceleration.
      -For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6), and it is recommended to install TensorRT 8.6.1.6.
      - +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
      +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
      +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
      bool False precision -Meaning:Computation precision, e.g., fp32, fp16. +Meaning: Computation precision, such as "fp32" or "fp16". str "fp32" enable_mkldnn -Meaning:Whether to enable MKL-DNN acceleration for inference.
      -Description: -If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +Meaning: Whether to enable MKL-DNN accelerated inference.
      +Description: +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. + bool True mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. +Meaning: MKL-DNN cache capacity. int 10 cpu_threads -Meaning:Number of threads used for inference on CPU. +Meaning: Number of threads used for inference on CPU. int -8 +10 paddlex_config -Meaning:Path to the PaddleX pipeline configuration file. +Meaning: Path to the PaddleX pipeline configuration file. str|None None + @@ -3082,7 +3139,7 @@ To remove the page limit, please add the following configuration to the pipeline layoutThreshold -number | object | null +number | object | null Please refer to the description of the layout_threshold parameter of the pipeline object's predict method. No @@ -3231,6 +3288,12 @@ To remove the page limit, please add the following configuration to the pipeline No +outputFormats +array | null +Optional list of extra formats to return. Currently only "docx" is supported. +No + + visualize boolean | null @@ -3306,6 +3369,11 @@ If neither the request body nor the configuration file is set (If visualiz string | null The input image. The image is in JPEG format and is Base64-encoded. + +exports +object | null +Optional additional exports when outputFormats is present—for example, {"docx": {"content": "..."}}, where content is the Base64-encoded file content. +

      markdown is an object with the following attributes:

      diff --git a/docs/version3.x/pipeline_usage/PP-StructureV3.md b/docs/version3.x/pipeline_usage/PP-StructureV3.md index 3e089517eb8..f6cbcddba66 100644 --- a/docs/version3.x/pipeline_usage/PP-StructureV3.md +++ b/docs/version3.x/pipeline_usage/PP-StructureV3.md @@ -21,6 +21,7 @@ comments: true 在本产线中,您可以根据下方的基准测试数据选择使用的模型。 > 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 在带有 [常规模式 / 高性能模式] 标记的推理耗时列中,`常规模式` 对应本地飞桨推理引擎。各模块会根据默认模型名称选择合适的本地飞桨推理引擎:仅支持动态图的模型使用 `paddle_dynamic`;同时支持静态图和动态图的模型优先使用 `paddle_static`。
      文档图像方向分类模块: @@ -999,7 +1000,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">推理模型/推理模型/推理引擎与配置说明。 +str|None +None + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt -含义:是否启用 Paddle Inference 的 TensorRT 子图引擎。 -
      说明:如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
      +含义:是否启用 Paddle Inference 的 TensorRT 子图引擎。
      +说明: +如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
      对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
      bool @@ -1554,14 +1573,15 @@ paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --device gpu precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 fp32fp16str fp32 enable_mkldnn -含义:是否启用 MKL-DNN 加速推理。 -
      说明:如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
      +含义:是否启用 MKL-DNN 加速推理。
      +说明: +如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool True @@ -1578,7 +1598,7 @@ paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --device gpu cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config @@ -1586,6 +1606,7 @@ paddleocr pp_structurev3 -i ./pp_structure_v3_demo.png --device gpu str +
      @@ -1642,6 +1663,31 @@ for res in output: res.save_to_word(save_path="output") ## 保存当前图像的Word格式的结果 ``` +上述代码默认使用本地飞桨推理引擎。默认情况下,各模块会根据默认模型名称选择合适的本地飞桨推理引擎:仅支持动态图的模型使用 `paddle_dynamic`;同时支持静态图和动态图的模型优先使用 `paddle_static`。如需运行,请先参考[飞桨框架安装说明](../paddlepaddle_installation.md)安装 PaddlePaddle。 + +如果选择 `transformers` 作为推理引擎,请先参考[推理引擎文档](../inference_engine.md)完成 Transformers 环境配置,然后执行如下代码: + +```python +from paddleocr import PPStructureV3 + +# 部分模型尚在支持中,推理时需关闭公式识别功能并更换无线表格结构识别模型,请使用以下代码: +pipeline = PPStructureV3( + engine="transformers", + use_formula_recognition=False, + wireless_table_structure_recognition_model_name="SLANeXt_wireless", +) +# pipeline = PPStructureV3(lang="en") # 将 lang 参数设置为使用英文文本识别模型。对于其他支持的语言,请参阅第5节:附录部分。默认配置为中英文模型。 +# pipeline = PPStructureV3(use_doc_orientation_classify=True) # 通过 use_doc_orientation_classify 指定是否使用文档方向分类模型 +# pipeline = PPStructureV3(use_doc_unwarping=True) # 通过 use_doc_unwarping 指定是否使用文本图像矫正模块 +# pipeline = PPStructureV3(use_textline_orientation=True) # 通过 use_textline_orientation 指定是否使用文本行方向分类模型 +# pipeline = PPStructureV3(device="gpu") # 通过 device 指定模型推理时使用 GPU +output = pipeline.predict("./pp_structure_v3_demo.png") +for res in output: + res.print() ## 打印预测的结构化输出 + res.save_to_json(save_path="output") ## 保存当前图像的结构化json结果 + res.save_to_markdown(save_path="output") ## 保存当前图像的markdown格式的结果 +``` + 如果是 PDF 文件,会将 PDF 的每一页单独处理,每一页的 Markdown 文件也会对应单独的结果。如果希望整个 PDF 文件转换为 Markdown 文件,建议使用以下的方式运行: ```python @@ -2255,32 +2301,46 @@ for item in markdown_images: str|None None + +engine +含义:推理引擎。
      说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic 和 transformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 +str|None +None + + +engine_config +含义:推理引擎配置。
      说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。 +dict|None +None + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt -含义:是否启用 Paddle Inference 的 TensorRT 子图引擎。 -
      说明:如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
      +含义:是否启用 Paddle Inference 的 TensorRT 子图引擎。
      +说明: +如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
      对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
      - bool False precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 "fp32""fp16"str -fp32 +"fp32" enable_mkldnn -含义:是否启用 MKL-DNN 加速推理。 -
      说明:如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 +含义:是否启用 MKL-DNN 加速推理。
      +说明: +如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool True @@ -2297,7 +2357,7 @@ for item in markdown_images: cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config @@ -2305,6 +2365,7 @@ for item in markdown_images: str|None None + @@ -2968,7 +3029,7 @@ for item in markdown_images: layoutThreshold -number | object |
      null +number | object | null 请参阅产线对象中 predict 方法的 layout_threshold 参数相关说明。 否 @@ -3117,6 +3178,12 @@ for item in markdown_images: 否 +outputFormats +array | null +可选。附加导出格式列表,默认不返回。当前仅支持 "docx"。 +否 + + visualize boolean | null 是否返回可视化结果图以及处理过程中的中间图像等。 @@ -3189,6 +3256,11 @@ for item in markdown_images: string | null 输入图像。图像为JPEG格式,使用Base64编码。 + +exports +object | null +可选的附加导出结果。仅当请求中包含 outputFormats 时出现,例如 {"docx": {"content": "..."}},其中 content 为文件内容的Base64编码。 +

      markdown为一个object,具有如下属性:

      diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-AMD-GPU.en.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-AMD-GPU.en.md index 6ae3b1b06cd..b0946d187a7 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-AMD-GPU.en.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-AMD-GPU.en.md @@ -7,7 +7,7 @@ comments: true > INFO: > Unless otherwise specified, the term "PaddleOCR-VL" in this tutorial refers to the PaddleOCR-VL model series (e.g., PaddleOCR-VL-1.5). References specific to the PaddleOCR-VL v1 version will be explicitly noted. -This tutorial is a guide for using PaddleOCR-VL on AMD GPU covering the complete workflow from environment preparation to service deployment. +This tutorial is a guide for using PaddleOCR-VL on AMD GPU, covering the complete workflow from environment preparation to service deployment. PaddleOCR-VL has been verified for accuracy and speed on the AMD MI300X. However, due to hardware diversity, compatibility with other AMD GPUs has not yet been confirmed. We welcome the community to test on different hardware setups and share your results. @@ -109,7 +109,7 @@ docker run -it \ > TIP: > Images with the `latest-xxx` tag correspond to the latest version of PaddleOCR. If you want to use a specific version of the PaddleOCR image, you can replace `latest` in the tag with the desired version number: `paddleocr<major>.<minor>`.
> For example: -> `ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleocr-genai-vllm-server:paddleocr3.3-amd-gpu-offline` +> `ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleocr-genai-vllm-server:paddleocr3.4-amd-gpu-offline` ### 3.2 Client Usage Method diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-AMD-GPU.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-AMD-GPU.md index 167237532f7..3d43fdef0d4 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-AMD-GPU.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-AMD-GPU.md @@ -12,7 +12,7 @@ comments: true 目前 PaddleOCR-VL 已在 AMD MI300X 上完成精度、速度验证;鉴于硬件环境的多样性,其他 AMD GPU 的兼容性尚未验证。我们诚挚欢迎社区用户在不同硬件上进行测试并反馈您的运行结果。 > TIP: -> 建议先阅读 [PaddleOCR-VL 使用教程](./PaddleOCR-VL.md) 中的 [流程导览](./PaddleOCR-VL.md),根据您的使用目标确认应阅读哪些章节;再回到当前硬件教程阅读对应章节。 +> 建议先阅读 [PaddleOCR-VL 使用教程](./PaddleOCR-VL.md) 中的 [流程导览](./PaddleOCR-VL.md#流程导览),根据您的使用目标确认应阅读哪些章节;再回到当前硬件教程阅读对应章节。 ## 1. 环境准备 @@ -48,7 +48,7 @@ docker run -it \ ### 1.2 方法二:手动安装 PaddlePaddle 和 PaddleOCR -如果您无法使用 Docker,也可以手动安装 PaddlePaddle 和 PaddleOCR。要求 Python 版本为 3.8–3.12。 +如果您无法使用 Docker,也可以手动安装 PaddlePaddle 和 PaddleOCR。要求 Python 版本为 3.8–3.13。 **我们强烈推荐您在虚拟环境中安装 PaddleOCR-VL,以避免发生依赖冲突。** 例如,使用 Python venv 标准库创建虚拟环境: @@ -145,7 +145,7 @@ docker run -it \ paddleocr-vl-api | INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) ``` -此方式基于 FastDeploy 框架对 VLM 推理进行加速,更适合生产环境部署。 +此方式基于 vLLM 框架对 VLM 推理进行加速,更适合生产环境部署。 此外,使用此方式启动服务器后,除拉取镜像外,无需连接互联网。如需在离线环境中部署,可先在联网机器上拉取 Compose 文件中涉及的镜像,导出并传输至离线机器中导入,即可在离线环境下启动服务。 diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Apple-Silicon.en.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Apple-Silicon.en.md index db8bb5762c2..e31b2a6b34b 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Apple-Silicon.en.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Apple-Silicon.en.md @@ -4,6 +4,9 @@ comments: true # PaddleOCR-VL Apple Silicon Usage Tutorial +> INFO: +> Unless otherwise specified, the term "PaddleOCR-VL" 
in this tutorial refers to the PaddleOCR-VL model series (e.g., PaddleOCR-VL-1.5). References specific to the PaddleOCR-VL v1 version will be explicitly noted. + This tutorial is a guide for using PaddleOCR-VL on Apple Silicon, covering the complete workflow from environment preparation to service deployment. Apple Silicon include, but are not limited to: @@ -15,6 +18,9 @@ Apple Silicon include, but are not limited to: PaddleOCR-VL has been verified for accuracy and speed on the Apple M4. However, due to hardware diversity, compatibility with other Apple Silicon has not yet been confirmed. We welcome the community to test on different hardware setups and share your results. +> TIP: +> Before reading this hardware-specific tutorial, we recommend first reading the [Process Guide](./PaddleOCR-VL.en.md#process-guide) in the main [PaddleOCR-VL Usage Tutorial](./PaddleOCR-VL.en.md) to determine which chapters apply to your goal, and then returning here to read the corresponding sections. + ## 1. Environment Preparation **We strongly recommend installing PaddleOCR-VL in a virtual environment to avoid dependency conflicts.** For example, use the Python venv standard library to create a virtual environment: @@ -39,9 +45,9 @@ python -m pip install -U "paddleocr[doc-parser]" Please refer to [PaddleOCR-VL Usage Tutorial - 2. Quick Start](./PaddleOCR-VL.en.md#2-quick-start). -## 3. Improving VLM Inference Performance Using Inference Acceleration Frameworks +## 3. Improving Inference Performance with VLM Inference Services -The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This step mainly introduces how to use the MLX-VLM inference acceleration framework to improve the inference performance of PaddleOCR-VL. +The inference performance under default configurations is not fully optimized and may not meet actual production requirements. 
This section introduces how to improve PaddleOCR-VL inference performance through a VLM inference service. In this hardware-specific guide, the examples use MLX-VLM as the backend for the VLM inference service. ### 3.1 Starting the VLM Inference Service @@ -59,6 +65,8 @@ mlx_vlm.server --port 8111 ### 3.2 Client Usage Method +The following invocation methods apply to an already launched MLX-VLM inference service. + #### 3.2.1 Command Line Usage You can specify the backend type (`mlx-vlm-server`) via `--vl_rec_backend`, the service address via `--vl_rec_server_url`, and the huggingface repo id or server-side model weights path via `--vl_rec_api_model_name`. For example: diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Apple-Silicon.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Apple-Silicon.md index ed5842bc71b..711623c72a6 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Apple-Silicon.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Apple-Silicon.md @@ -4,6 +4,9 @@ comments: true # PaddleOCR-VL Apple Silicon 使用教程 +> INFO: +> 除非另有说明,本教程中提到的 “PaddleOCR-VL” 均指 PaddleOCR-VL 系列模型(如 PaddleOCR-VL-1.5 等);若特指 PaddleOCR-VL v1 版本,将另行明确标注。 + 本教程是 PaddleOCR-VL 在 Apple Silicon 上的使用指南,涵盖了从环境准备到服务化部署的完整流程。 Apple Silicon 包括但不限于以下几种: @@ -15,6 +18,9 @@ Apple Silicon 包括但不限于以下几种: 目前 PaddleOCR-VL 已在 Apple M4 上完成精度验证;鉴于硬件环境的多样性,其他 Apple Silicon 的兼容性尚未验证。我们诚挚欢迎社区用户在不同硬件上进行测试并反馈您的运行结果。 +> TIP: +> 建议先阅读 [PaddleOCR-VL 使用教程](./PaddleOCR-VL.md) 中的 [流程导览](./PaddleOCR-VL.md#流程导览),根据您的使用目标确认应阅读哪些章节;再回到当前硬件教程阅读对应章节。 + ## 1. 环境准备 **我们强烈推荐您在虚拟环境中安装 PaddleOCR-VL,以避免发生依赖冲突。** 例如,使用 Python venv 标准库创建虚拟环境: @@ -39,9 +45,9 @@ python -m pip install -U "paddleocr[doc-parser]" 请参考[PaddleOCR-VL 使用教程 - 2. 快速开始](./PaddleOCR-VL.md#2)。 -## 3. 使用推理加速框架提升 VLM 推理性能 +## 3. 
使用 VLM 推理服务提升推理性能 -默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何使用 MLX-VLM 推理加速框架来提升 PaddleOCR-VL 的推理性能。 +默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何通过 VLM 推理服务提升 PaddleOCR-VL 的推理性能。在当前硬件文档中,示例使用 MLX-VLM 作为 VLM 推理服务后端。 ### 3.1 启动 VLM 推理服务 @@ -59,6 +65,8 @@ mlx_vlm.server --port 8111 ### 3.2 客户端使用方法 +以下调用方式适用于已启动的 MLX-VLM 推理服务。 + #### 3.2.1 CLI 调用 可通过 `--vl_rec_backend` 指定后端类型(`mlx-vlm-server`),通过 `--vl_rec_server_url` 指定服务地址,通过 `--vl_rec_api_model_name` 指定 huggingface repo id 或服务端模型权重路径,例如: diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Huawei-Ascend-NPU.en.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Huawei-Ascend-NPU.en.md index 6386ee10c00..08ab1acd48e 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Huawei-Ascend-NPU.en.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Huawei-Ascend-NPU.en.md @@ -4,10 +4,16 @@ comments: true # PaddleOCR-VL Huawei Ascend NPU Usage Tutorial -This tutorial is a guide for using PaddleOCR-VL on Huawei Ascend NPU covering the complete workflow from environment preparation to service deployment. +> INFO: +> Unless otherwise specified, the term "PaddleOCR-VL" in this tutorial refers to the PaddleOCR-VL model series (e.g., PaddleOCR-VL-1.5). References specific to the PaddleOCR-VL v1 version will be explicitly noted. + +This tutorial is a guide for using PaddleOCR-VL on Huawei Ascend NPU, covering the complete workflow from environment preparation to service deployment. PaddleOCR-VL has been verified for accuracy and speed on the Huawei Ascend 910B. However, due to hardware diversity, compatibility with other Huawei Ascend NPUs has not yet been confirmed. We welcome the community to test on different hardware setups and share your results. 
+> TIP: +> Before reading this hardware-specific tutorial, we recommend first reading the [Process Guide](./PaddleOCR-VL.en.md#process-guide) in the main [PaddleOCR-VL Usage Tutorial](./PaddleOCR-VL.en.md) to determine which chapters apply to your goal, and then returning here to read the corresponding sections. + ## 1. Environment Preparation This step mainly introduces how to set up the runtime environment for PaddleOCR-VL. There are two methods available; choose either one: @@ -70,9 +76,9 @@ python -m pip install -U "paddleocr[doc-parser]" The NPU currently does not support inference using the `PaddlePaddle` inference method. Please refer to the next section on using the `vLLM` inference acceleration framework for inference. -## 3. Improving VLM Inference Performance Using Inference Acceleration Framework +## 3. Improving Inference Performance with VLM Inference Services -The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This step mainly introduces how to use the vLLM inference acceleration framework to improve the inference performance of PaddleOCR-VL. +The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This section introduces how to improve PaddleOCR-VL inference performance through a VLM inference service. In this hardware-specific guide, the examples use vLLM as the backend for the VLM inference service. ### 3.1 Starting the VLM Inference Service @@ -116,7 +122,7 @@ docker run -it \ ### 3.2 Client Usage Method -Please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage Methods](./PaddleOCR-VL.en.md#32-client-usage-methods). +For client-side invocation methods, please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage Methods](./PaddleOCR-VL.en.md#32-client-usage-methods). If you run the client on this hardware, make sure to specify `device="npu"`. 
### 3.3 Performance Tuning diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Huawei-Ascend-NPU.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Huawei-Ascend-NPU.md index ed31595d029..7e348daa36e 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Huawei-Ascend-NPU.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Huawei-Ascend-NPU.md @@ -4,10 +4,16 @@ comments: true # PaddleOCR-VL 华为昇腾 NPU 使用教程 +> INFO: +> 除非另有说明,本教程中提到的 “PaddleOCR-VL” 均指 PaddleOCR-VL 系列模型(如 PaddleOCR-VL-1.5 等);若特指 PaddleOCR-VL v1 版本,将另行明确标注。 + 本教程是 PaddleOCR-VL 在华为昇腾 NPU 上的使用指南,涵盖了从环境准备到服务化部署的完整流程。 目前 PaddleOCR-VL 已在华为昇腾 910B 上完成精度、速度验证;鉴于硬件环境的多样性,其他华为昇腾 NPU 的兼容性尚未验证。我们诚挚欢迎社区用户在不同硬件上进行测试并反馈您的运行结果。 +> TIP: +> 建议先阅读 [PaddleOCR-VL 使用教程](./PaddleOCR-VL.md) 中的 [流程导览](./PaddleOCR-VL.md#流程导览),根据您的使用目标确认应阅读哪些章节;再回到当前硬件教程阅读对应章节。 + ## 1. 环境准备 此步骤主要介绍如何搭建 PaddleOCR-VL 的运行环境,有以下两种方式,任选一种即可: @@ -70,9 +76,9 @@ python -m pip install -U "paddleocr[doc-parser]" NPU 暂时不支持使用 PaddlePaddle 推理方式推理,请参考使用下一节使用 vLLM 推理加速框架推理。 -## 3. 使用推理加速框架提升 VLM 推理性能 +## 3. 
使用 VLM 推理服务提升推理性能 -此步骤主要介绍如何使用 vLLM 推理加速框架来提升 PaddleOCR-VL 的推理性能。 +此步骤主要介绍如何通过 VLM 推理服务提升 PaddleOCR-VL 的推理性能。在当前硬件文档中,示例使用 vLLM 作为 VLM 推理服务后端。 ### 3.1 启动 VLM 推理服务 @@ -116,7 +122,7 @@ docker run -it \ ### 3.2 客户端使用方法 -请参考[PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。 +客户端调用方式请参考 [PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。如需在当前硬件上运行客户端,请注意指定 `device="npu"`。 ### 3.3 性能调优 @@ -148,7 +154,7 @@ docker run -it \ paddleocr-vl-api | INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) ``` -此方式基于 FastDeploy 框架对 VLM 推理进行加速,更适合生产环境部署。 +此方式基于 vLLM 框架对 VLM 推理进行加速,更适合生产环境部署。 此外,使用此方式启动服务器后,除拉取镜像外,无需连接互联网。如需在离线环境中部署,可先在联网机器上拉取 Compose 文件中涉及的镜像,导出并传输至离线机器中导入,即可在离线环境下启动服务。 diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Hygon-DCU.en.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Hygon-DCU.en.md index b9fa94db9d3..015086d6dae 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Hygon-DCU.en.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Hygon-DCU.en.md @@ -4,10 +4,16 @@ comments: true # PaddleOCR-VL Hygon DCU Usage Tutorial -This tutorial is a guide for using PaddleOCR-VL on Hygon DCU covering the complete workflow from environment preparation to service deployment. +> INFO: +> Unless otherwise specified, the term "PaddleOCR-VL" in this tutorial refers to the PaddleOCR-VL model series (e.g., PaddleOCR-VL-1.5). References specific to the PaddleOCR-VL v1 version will be explicitly noted. + +This tutorial is a guide for using PaddleOCR-VL on Hygon DCU, covering the complete workflow from environment preparation to service deployment. PaddleOCR-VL has been verified for accuracy and speed on the Hygon K100AI. However, due to hardware diversity, compatibility with other Hygon DCUs has not yet been confirmed. We welcome the community to test on different hardware setups and share your results. 
+> TIP: +> Before reading this hardware-specific tutorial, we recommend first reading the [Process Guide](./PaddleOCR-VL.en.md#process-guide) in the main [PaddleOCR-VL Usage Tutorial](./PaddleOCR-VL.en.md) to determine which chapters apply to your goal, and then returning here to read the corresponding sections. + ## 1. Environment Preparation This step mainly introduces how to set up the runtime environment for PaddleOCR-VL. There are two methods available; choose either one: @@ -74,9 +80,9 @@ python -m pip install -U "paddleocr[doc-parser]" Please refer to [PaddleOCR-VL Usage Tutorial - 2. Quick Start](./PaddleOCR-VL.en.md#2-quick-start), making sure to specify `device="dcu"`. -## 3. Improving VLM Inference Performance Using Inference Acceleration Framework +## 3. Improving Inference Performance with VLM Inference Services -The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This step mainly introduces how to use the vLLM inference acceleration framework to improve the inference performance of PaddleOCR-VL. +The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This section introduces how to improve PaddleOCR-VL inference performance through a VLM inference service. In this hardware-specific guide, the examples use vLLM as the backend for the VLM inference service. ### 3.1 Starting the VLM Inference Service @@ -129,7 +135,7 @@ docker run -it \ ### 3.2 Client Usage Method -Please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage Methods](./PaddleOCR-VL.en.md#32-client-usage-methods). +For client-side invocation methods, please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage Methods](./PaddleOCR-VL.en.md#32-client-usage-methods). If you run the client on this hardware, make sure to specify `device="dcu"`. 
### 3.3 Performance Tuning diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Hygon-DCU.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Hygon-DCU.md index fb7d31c2d58..dcf233e5543 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Hygon-DCU.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Hygon-DCU.md @@ -4,10 +4,16 @@ comments: true # PaddleOCR-VL 海光 DCU 使用教程 +> INFO: +> 除非另有说明,本教程中提到的 “PaddleOCR-VL” 均指 PaddleOCR-VL 系列模型(如 PaddleOCR-VL-1.5 等);若特指 PaddleOCR-VL v1 版本,将另行明确标注。 + 本教程是 PaddleOCR-VL 在海光 DCU 上的使用指南,涵盖了从环境准备到服务化部署的完整流程。 目前 PaddleOCR-VL 已在海光 K100AI 上完成精度、速度验证;鉴于硬件环境的多样性,其他海光 DCU 的兼容性尚未验证。我们诚挚欢迎社区用户在不同硬件上进行测试并反馈您的运行结果。 +> TIP: +> 建议先阅读 [PaddleOCR-VL 使用教程](./PaddleOCR-VL.md) 中的 [流程导览](./PaddleOCR-VL.md#流程导览),根据您的使用目标确认应阅读哪些章节;再回到当前硬件教程阅读对应章节。 + ## 1. 环境准备 此步骤主要介绍如何搭建 PaddleOCR-VL 的运行环境,有以下两种方式,任选一种即可: @@ -74,9 +80,9 @@ python -m pip install -U "paddleocr[doc-parser]" 请参考[PaddleOCR-VL 使用教程 - 2. 快速开始](./PaddleOCR-VL.md#2),注意需要指定 `device="dcu"`。 -## 3. 使用推理加速框架提升 VLM 推理性能 +## 3. 
使用 VLM 推理服务提升推理性能 -默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何使用 vLLM 推理加速框架来提升 PaddleOCR-VL 的推理性能。 +默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何通过 VLM 推理服务提升 PaddleOCR-VL 的推理性能。在当前硬件文档中,示例使用 vLLM 作为 VLM 推理服务后端。 ### 3.1 启动 VLM 推理服务 @@ -128,7 +134,7 @@ docker run -it \ ### 3.2 客户端使用方法 -请参考 [PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。 +客户端调用方式请参考 [PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。如需在当前硬件上运行客户端,请注意指定 `device="dcu"`。 ### 3.3 性能调优 diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Iluvatar-GPU.en.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Iluvatar-GPU.en.md index 0aa23859bf4..bae1a3932a5 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Iluvatar-GPU.en.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Iluvatar-GPU.en.md @@ -4,10 +4,16 @@ comments: true # PaddleOCR-VL Iluvatar GPU Usage Tutorial -This tutorial is a guide for using PaddleOCR-VL on Iluvatar GPU covering the complete workflow from environment preparation to service deployment. +> INFO: +> Unless otherwise specified, the term "PaddleOCR-VL" in this tutorial refers to the PaddleOCR-VL model series (e.g., PaddleOCR-VL-1.5). References specific to the PaddleOCR-VL v1 version will be explicitly noted. + +This tutorial is a guide for using PaddleOCR-VL on Iluvatar GPU, covering the complete workflow from environment preparation to service deployment. PaddleOCR-VL has been verified for accuracy and speed on the Iluvatar BI-V150. However, due to hardware diversity, compatibility with other Iluvatar GPUs has not yet been confirmed. We welcome the community to test on different hardware setups and share your results. +> TIP: +> Before reading this hardware-specific tutorial, we recommend first reading the [Process Guide](./PaddleOCR-VL.en.md#process-guide) in the main [PaddleOCR-VL Usage Tutorial](./PaddleOCR-VL.en.md) to determine which chapters apply to your goal, and then returning here to read the corresponding sections. + ## 1. 
Environment Preparation This step mainly introduces how to set up the runtime environment for PaddleOCR-VL. There are two methods available; choose either one: @@ -72,9 +78,9 @@ python -m pip install -U "paddleocr[doc-parser]" Please refer to [PaddleOCR-VL Usage Tutorial - 2. Quick Start](./PaddleOCR-VL.en.md#2-quick-start), making sure to specify `device="iluvatar_gpu"`. -## 3. Improving VLM Inference Performance Using Inference Acceleration Framework +## 3. Improving Inference Performance with VLM Inference Services -The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This step mainly introduces how to use the FastDeploy inference acceleration framework to improve the inference performance of PaddleOCR-VL. +The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This section introduces how to improve PaddleOCR-VL inference performance through a VLM inference service. In this hardware-specific guide, the examples use FastDeploy as the backend for the VLM inference service. ### 3.1 Starting the VLM Inference Service @@ -122,7 +128,7 @@ docker run -it \ ### 3.2 Client Usage Method -Please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage Methods](./PaddleOCR-VL.en.md#32-client-usage-methods). +For client-side invocation methods, please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage Methods](./PaddleOCR-VL.en.md#32-client-usage-methods). If you run the client on this hardware, make sure to specify `device="iluvatar_gpu"`. 
### 3.3 Performance Tuning diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Iluvatar-GPU.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Iluvatar-GPU.md index a886a90f007..c4e188d5581 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Iluvatar-GPU.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Iluvatar-GPU.md @@ -4,10 +4,16 @@ comments: true # PaddleOCR-VL 天数 GPU 使用教程 +> INFO: +> 除非另有说明,本教程中提到的 “PaddleOCR-VL” 均指 PaddleOCR-VL 系列模型(如 PaddleOCR-VL-1.5 等);若特指 PaddleOCR-VL v1 版本,将另行明确标注。 + 本教程是 PaddleOCR-VL 在天数 GPU 上的使用指南,涵盖了从环境准备到服务化部署的完整流程。 目前 PaddleOCR-VL 已在天数天垓 150 上完成精度、速度验证;鉴于硬件环境的多样性,其他天数 GPU 的兼容性尚未验证。我们诚挚欢迎社区用户在不同硬件上进行测试并反馈您的运行结果。 +> TIP: +> 建议先阅读 [PaddleOCR-VL 使用教程](./PaddleOCR-VL.md) 中的 [流程导览](./PaddleOCR-VL.md#流程导览),根据您的使用目标确认应阅读哪些章节;再回到当前硬件教程阅读对应章节。 + ## 1. 环境准备 此步骤主要介绍如何搭建 PaddleOCR-VL 的运行环境,有以下两种方式,任选一种即可: @@ -72,9 +78,9 @@ python -m pip install -U "paddleocr[doc-parser]" 请参考 [PaddleOCR-VL 使用教程 - 2. 快速开始](./PaddleOCR-VL.md#2),注意需要指定 `device="iluvatar_gpu"`。 -## 3. 使用推理加速框架提升 VLM 推理性能 +## 3. 
使用 VLM 推理服务提升推理性能 -默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何使用 FastDeploy 推理加速框架来提升 PaddleOCR-VL 的推理性能。 +默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何通过 VLM 推理服务提升 PaddleOCR-VL 的推理性能。在当前硬件文档中,示例使用 FastDeploy 作为 VLM 推理服务后端。 ### 3.1 启动 VLM 推理服务 @@ -122,7 +128,7 @@ docker run -it \ ### 3.2 客户端使用方法 -请参考 [PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。 +客户端调用方式请参考 [PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。如需在当前硬件上运行客户端,请注意指定 `device="iluvatar_gpu"`。 ### 3.3 性能调优 diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Intel-Arc-GPU.en.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Intel-Arc-GPU.en.md index ab3374e78dc..dd5ccabdc08 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Intel-Arc-GPU.en.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Intel-Arc-GPU.en.md @@ -7,7 +7,7 @@ comments: true > INFO: > Unless otherwise specified, the term "PaddleOCR-VL" in this tutorial refers to the PaddleOCR-VL model series (e.g., PaddleOCR-VL-1.5). References specific to the PaddleOCR-VL v1 version will be explicitly noted. -This tutorial is a guide for using PaddleOCR-VL on Intel Arc GPU covering the complete workflow from environment preparation to service deployment. +This tutorial is a guide for using PaddleOCR-VL on Intel Arc GPU, covering the complete workflow from environment preparation to service deployment. PaddleOCR-VL has been verified for accuracy and speed on the Intel Arc B60 Pro. However, due to hardware diversity, compatibility with other Intel Arc GPUs has not yet been confirmed. We welcome the community to test on different hardware setups and share your results. 
diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Intel-Arc-GPU.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Intel-Arc-GPU.md index b88333bf1da..de8d6a49fed 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Intel-Arc-GPU.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Intel-Arc-GPU.md @@ -12,7 +12,7 @@ comments: true 目前 PaddleOCR-VL 已在 Intel Arc B60 Pro 上完成精度、速度验证;鉴于硬件环境的多样性,其他 Intel Arc GPU 的兼容性尚未验证。我们诚挚欢迎社区用户在不同硬件上进行测试并反馈您的运行结果。 > TIP: -> 建议先阅读 [PaddleOCR-VL 使用教程](./PaddleOCR-VL.md) 中的 [流程导览](./PaddleOCR-VL.md),根据您的使用目标确认应阅读哪些章节;再回到当前硬件教程阅读对应章节。 +> 建议先阅读 [PaddleOCR-VL 使用教程](./PaddleOCR-VL.md) 中的 [流程导览](./PaddleOCR-VL.md#流程导览),根据您的使用目标确认应阅读哪些章节;再回到当前硬件教程阅读对应章节。 ## 1. 环境准备 @@ -48,7 +48,7 @@ docker run -it \ ### 1.2 方法二:手动安装 PaddlePaddle 和 PaddleOCR -如果您无法使用 Docker,也可以手动安装 PaddlePaddle 和 PaddleOCR。要求 Python 版本为 3.8–3.12。 +如果您无法使用 Docker,也可以手动安装 PaddlePaddle 和 PaddleOCR。要求 Python 版本为 3.8–3.13。 **我们强烈推荐您在虚拟环境中安装 PaddleOCR-VL,以避免发生依赖冲突。** 例如,使用 Python venv 标准库创建虚拟环境: @@ -70,7 +70,7 @@ python -m pip install -U "paddleocr[doc-parser]" ## 2. 快速开始 -Intel Arc GPU 暂时不支持使用 PaddlePaddle 推理方式推理,请参考使用下一节使用 vLLM 推理加速框架推理。 +Intel Arc GPU 暂不支持 PaddlePaddle 推理方式,请参考下一节使用 vLLM 推理加速框架进行推理。 ## 3. 
使用 VLM 推理服务提升推理性能 @@ -145,7 +145,7 @@ docker run -it \ paddleocr-vl-api | INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) ``` -此方式基于 FastDeploy 框架对 VLM 推理进行加速,更适合生产环境部署。 +此方式基于 vLLM 框架对 VLM 推理进行加速,更适合生产环境部署。 此外,使用此方式启动服务器后,除拉取镜像外,无需连接互联网。如需在离线环境中部署,可先在联网机器上拉取 Compose 文件中涉及的镜像,导出并传输至离线机器中导入,即可在离线环境下启动服务。 diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Kunlunxin-XPU.en.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Kunlunxin-XPU.en.md index ae8f156af08..6c3edc08e8a 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Kunlunxin-XPU.en.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Kunlunxin-XPU.en.md @@ -4,10 +4,16 @@ comments: true # PaddleOCR-VL Kunlunxin XPU Usage Tutorial +> INFO: +> Unless otherwise specified, the term "PaddleOCR-VL" in this tutorial refers to the PaddleOCR-VL model series (e.g., PaddleOCR-VL-1.5). References specific to the PaddleOCR-VL v1 version will be explicitly noted. + This tutorial is a guide for using PaddleOCR-VL on Kunlunxin XPU, covering the complete workflow from environment preparation to service deployment. PaddleOCR-VL has been verified for accuracy and speed on the Kunlunxin P800. However, due to hardware diversity, compatibility with other Kunlunxin XPUs has not yet been confirmed. We welcome the community to test on different hardware setups and share your results. +> TIP: +> Before reading this hardware-specific tutorial, we recommend first reading the [Process Guide](./PaddleOCR-VL.en.md#process-guide) in the main [PaddleOCR-VL Usage Tutorial](./PaddleOCR-VL.en.md) to determine which chapters apply to your goal, and then returning here to read the corresponding sections. + ## 1. Environment Preparation This step mainly introduces how to set up the runtime environment for PaddleOCR-VL. There are two methods available; choose one as needed: @@ -67,9 +73,9 @@ python -m pip install -U "paddleocr[doc-parser]" Please refer to [PaddleOCR-VL Usage Tutorial - 2. 
Quick Start](./PaddleOCR-VL.en.md#2-quick-start), making sure to specify `device='xpu'`. -## 3. Enhancing VLM Inference Performance Using Inference Acceleration Framework +## 3. Improving Inference Performance with VLM Inference Services -The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This step mainly introduces how to use the FastDeploy inference acceleration framework to enhance the inference performance of PaddleOCR-VL. +The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This section introduces how to improve PaddleOCR-VL inference performance through a VLM inference service. In this hardware-specific guide, the examples use FastDeploy as the backend for the VLM inference service. ### 3.1 Starting the VLM Inference Service @@ -110,7 +116,7 @@ docker run \ ### 3.2 Client Usage Method -Please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage Method](./PaddleOCR-VL.en.md#32-client-usage-methods). +For client-side invocation methods, please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage Methods](./PaddleOCR-VL.en.md#32-client-usage-methods). If you run the client on this hardware, make sure to specify `device="xpu"`. 
### 3.3 Performance Tuning diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Kunlunxin-XPU.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Kunlunxin-XPU.md index d4a087db4cb..8cfe4417aee 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-Kunlunxin-XPU.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-Kunlunxin-XPU.md @@ -4,10 +4,16 @@ comments: true # PaddleOCR-VL 昆仑芯 XPU 使用教程 +> INFO: +> 除非另有说明,本教程中提到的 “PaddleOCR-VL” 均指 PaddleOCR-VL 系列模型(如 PaddleOCR-VL-1.5 等);若特指 PaddleOCR-VL v1 版本,将另行明确标注。 + 本教程是 PaddleOCR-VL 在昆仑芯 XPU 上的使用指南,涵盖了从环境准备到服务化部署的完整流程。 目前 PaddleOCR-VL 已在昆仑芯 P800 上完成精度、速度验证;鉴于硬件环境的多样性,其他昆仑芯 XPU 的兼容性尚未验证。我们诚挚欢迎社区用户在不同硬件上进行测试并反馈您的运行结果。 +> TIP: +> 建议先阅读 [PaddleOCR-VL 使用教程](./PaddleOCR-VL.md) 中的 [流程导览](./PaddleOCR-VL.md#流程导览),根据您的使用目标确认应阅读哪些章节;再回到当前硬件教程阅读对应章节。 + ## 1. 环境准备 此步骤主要介绍如何搭建 PaddleOCR-VL 的运行环境,有以下两种方式,任选一种即可: @@ -67,9 +73,9 @@ python -m pip install -U "paddleocr[doc-parser]" 请参考 [PaddleOCR-VL 使用教程 - 2. 快速开始](./PaddleOCR-VL.md#2),注意需要指定 `device="xpu"`。 -## 3. 使用推理加速框架提升 VLM 推理性能 +## 3. 
使用 VLM 推理服务提升推理性能 -默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何使用 FastDeploy 推理加速框架来提升 PaddleOCR-VL 的推理性能。 +默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何通过 VLM 推理服务提升 PaddleOCR-VL 的推理性能。在当前硬件文档中,示例使用 FastDeploy 作为 VLM 推理服务后端。 ### 3.1 启动 VLM 推理服务 @@ -110,7 +116,7 @@ docker run \ ### 3.2 客户端使用方法 -请参考 [PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。 +客户端调用方式请参考 [PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。如需在当前硬件上运行客户端,请注意指定 `device="xpu"`。 ### 3.3 性能调优 diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-MetaX-GPU.en.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-MetaX-GPU.en.md index 8abd65cb666..eeecb33f96a 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-MetaX-GPU.en.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-MetaX-GPU.en.md @@ -4,10 +4,16 @@ comments: true # PaddleOCR-VL MetaX GPU Usage Tutorial +> INFO: +> Unless otherwise specified, the term "PaddleOCR-VL" in this tutorial refers to the PaddleOCR-VL model series (e.g., PaddleOCR-VL-1.5). References specific to the PaddleOCR-VL v1 version will be explicitly noted. + This tutorial is a guide for using PaddleOCR-VL on MetaX GPU, covering the complete workflow from environment preparation to service deployment. PaddleOCR-VL has been verified for accuracy and speed on the MetaX C550. However, due to hardware diversity, compatibility with other MetaX GPUs has not yet been confirmed. We welcome the community to test on different hardware setups and share your results. +> TIP: +> Before reading this hardware-specific tutorial, we recommend first reading the [Process Guide](./PaddleOCR-VL.en.md#process-guide) in the main [PaddleOCR-VL Usage Tutorial](./PaddleOCR-VL.en.md) to determine which chapters apply to your goal, and then returning here to read the corresponding sections. + ## 1. Environment Preparation This step mainly introduces how to set up the runtime environment for PaddleOCR-VL. 
There are two methods available; choose either one: @@ -69,9 +75,9 @@ python -m pip install -U "paddleocr[doc-parser]" Please refer to [PaddleOCR-VL Usage Tutorial - 2. Quick Start](./PaddleOCR-VL.en.md#2-quick-start), making sure to specify `device='metax_gpu'`. -## 3. Improving VLM Inference Performance Using Inference Acceleration Framework +## 3. Improving Inference Performance with VLM Inference Services -The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This step mainly introduces how to use the FastDeploy inference acceleration framework to improve the inference performance of PaddleOCR-VL. +The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This section introduces how to improve PaddleOCR-VL inference performance through a VLM inference service. In this hardware-specific guide, the examples use FastDeploy as the backend for the VLM inference service. ### 3.1 Starting the VLM Inference Service @@ -117,7 +123,7 @@ docker run -it \ ### 3.2 Client Usage Method -Please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage Method](./PaddleOCR-VL.en.md#32-client-usage-methods). +For client-side invocation methods, please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage Methods](./PaddleOCR-VL.en.md#32-client-usage-methods). If you run the client on this hardware, make sure to specify `device="metax_gpu"`. 
### 3.3 Performance Tuning diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-MetaX-GPU.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-MetaX-GPU.md index cc7230e06f6..5a115803701 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-MetaX-GPU.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-MetaX-GPU.md @@ -4,10 +4,16 @@ comments: true # PaddleOCR-VL 沐曦 GPU 使用教程 +> INFO: +> 除非另有说明,本教程中提到的 “PaddleOCR-VL” 均指 PaddleOCR-VL 系列模型(如 PaddleOCR-VL-1.5 等);若特指 PaddleOCR-VL v1 版本,将另行明确标注。 + 本教程是 PaddleOCR-VL 在沐曦 GPU 上的使用指南,涵盖了从环境准备到服务化部署的完整流程。 目前 PaddleOCR-VL 已在沐曦 C550 上完成精度、速度验证;鉴于硬件环境的多样性,其他沐曦 GPU 的兼容性尚未验证。我们诚挚欢迎社区用户在不同硬件上进行测试并反馈您的运行结果。 +> TIP: +> 建议先阅读 [PaddleOCR-VL 使用教程](./PaddleOCR-VL.md) 中的 [流程导览](./PaddleOCR-VL.md#流程导览),根据您的使用目标确认应阅读哪些章节;再回到当前硬件教程阅读对应章节。 + ## 1. 环境准备 此步骤主要介绍如何搭建 PaddleOCR-VL 的运行环境,有以下两种方式,任选一种即可: @@ -69,9 +75,9 @@ python -m pip install -U "paddleocr[doc-parser]" 请参考 [PaddleOCR-VL 使用教程 - 2. 快速开始](./PaddleOCR-VL.md#2),注意需要指定 `device="metax_gpu"`。 -## 3. 使用推理加速框架提升 VLM 推理性能 +## 3. 
使用 VLM 推理服务提升推理性能 -默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何使用 FastDeploy 推理加速框架来提升 PaddleOCR-VL 的推理性能。 +默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何通过 VLM 推理服务提升 PaddleOCR-VL 的推理性能。在当前硬件文档中,示例使用 FastDeploy 作为 VLM 推理服务后端。 ### 3.1 启动 VLM 推理服务 @@ -117,7 +123,7 @@ docker run -it \ ### 3.2 客户端使用方法 -请参考 [PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。 +客户端调用方式请参考 [PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。如需在当前硬件上运行客户端,请注意指定 `device="metax_gpu"`。 ### 3.3 性能调优 diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-NVIDIA-Blackwell.en.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-NVIDIA-Blackwell.en.md index 548ca169ced..84fe228f1c7 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-NVIDIA-Blackwell.en.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-NVIDIA-Blackwell.en.md @@ -4,6 +4,9 @@ comments: true # PaddleOCR-VL NVIDIA Blackwell-Architecture GPUs Usage Tutorial +> INFO: +> Unless otherwise specified, the term "PaddleOCR-VL" in this tutorial refers to the PaddleOCR-VL model series (e.g., PaddleOCR-VL-1.5). References specific to the PaddleOCR-VL v1 version will be explicitly noted. + This tutorial provides guidance on using PaddleOCR-VL on NVIDIA Blackwell-architecture GPUs, covering the complete workflow from environment preparation to service deployment. NVIDIA Blackwell-architecture GPUs include, but are not limited to: @@ -18,6 +21,9 @@ PaddleOCR-VL has been verified for accuracy and speed on the RTX 5070. However, Before starting the tutorial, **please ensure that your NVIDIA driver supports CUDA 12.9 or higher**. +> TIP: +> Before reading this hardware-specific tutorial, we recommend first reading the [Process Guide](./PaddleOCR-VL.en.md#process-guide) in the main [PaddleOCR-VL Usage Tutorial](./PaddleOCR-VL.en.md) to determine which chapters apply to your goal, and then returning here to read the corresponding sections. + ## 1. 
Environment Preparation This section introduces how to set up the PaddleOCR-VL runtime environment using one of the following two methods: @@ -77,9 +83,9 @@ python -m pip install -U "paddleocr[doc-parser]" Please refer to [PaddleOCR-VL Usage Tutorial - 2. Quick Start](./PaddleOCR-VL.en.md#2-quick-start). -## 3. Improving VLM Inference Performance Using Inference Acceleration Frameworks +## 3. Improving Inference Performance with VLM Inference Services -The inference performance under default configurations may not be fully optimized and may not meet actual production requirements. This section introduces how to use the vLLM and SGLang inference acceleration frameworks to enhance PaddleOCR-VL's inference performance. +The inference performance under default configurations may not be fully optimized and may not meet actual production requirements. This section introduces how to improve PaddleOCR-VL inference performance through a VLM inference service. In this hardware-specific guide, the examples use vLLM and SGLang as the backends for the VLM inference service. ### 3.1 Starting the VLM Inference Service @@ -126,7 +132,7 @@ docker run \ #### 3.1.2 Method 2: Installation and Usage via PaddleOCR CLI -Due to potential dependency conflicts between inference acceleration frameworks and PaddlePaddle, it is recommended to install them in a virtual environment: +Since inference acceleration frameworks may conflict with packages already installed in the current environment, it is recommended to install them in a virtual environment: ```shell # If a virtual environment is currently activated, deactivate it first using `deactivate` @@ -155,6 +161,9 @@ paddleocr install_genai_server_deps Currently supported framework names are `vllm` and `sglang`, corresponding to vLLM and SGLang, respectively. 
+> WARNING: +> The transformers library versions required by vLLM and SGLang currently conflict with the version required by the Transformers engine, so the Transformers engine cannot be installed in the same environment as vLLM or SGLang. If you use Transformers + vLLM or Transformers + SGLang inference, please deploy the layout detection model and the VLM service in different environments. + After installation, you can start the service using the `paddleocr genai_server` command: ```shell @@ -174,7 +183,7 @@ The parameters supported by this command are as follows: ### 3.2 Client Usage -Please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage](./PaddleOCR-VL.en.md#32-client-usage-methods). +For client-side invocation methods, please refer to [PaddleOCR-VL Usage Tutorial - 3.2 Client Usage Methods](./PaddleOCR-VL.en.md#32-client-usage-methods). If you run the client on this hardware, make sure to specify `device="gpu"`. ### 3.3 Performance Tuning diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL-NVIDIA-Blackwell.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL-NVIDIA-Blackwell.md index 7441b7b9349..1fb1b93f748 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL-NVIDIA-Blackwell.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL-NVIDIA-Blackwell.md @@ -4,6 +4,9 @@ comments: true # PaddleOCR-VL NVIDIA Blackwell 架构 GPU 使用教程 +> INFO: +> 除非另有说明,本教程中提到的 “PaddleOCR-VL” 均指 PaddleOCR-VL 系列模型(如 PaddleOCR-VL-1.5 等);若特指 PaddleOCR-VL v1 版本,将另行明确标注。 + 本教程是 PaddleOCR-VL 在 NVIDIA Blackwell 架构 GPU 上的使用指南,涵盖了从环境准备到服务化部署的完整流程。 NVIDIA Blackwell 架构 GPU 包括但不限于以下几种: @@ -18,6 +21,9 @@ NVIDIA Blackwell 架构 GPU 包括但不限于以下几种: 教程开始前,**请确认您的 NVIDIA 驱动支持 CUDA 12.9 或以上版本**。 +> TIP: +> 建议先阅读 [PaddleOCR-VL 使用教程](./PaddleOCR-VL.md) 中的 [流程导览](./PaddleOCR-VL.md#流程导览),根据您的使用目标确认应阅读哪些章节;再回到当前硬件教程阅读对应章节。 + ## 1. 环境准备 此步骤主要介绍如何搭建 PaddleOCR-VL 的运行环境,有以下两种方式,任选一种即可: @@ -77,9 +83,9 @@ python -m pip install -U "paddleocr[doc-parser]" 请参考 [PaddleOCR-VL 使用教程 - 2. 快速开始](./PaddleOCR-VL.md#2)。 -## 3. 
使用推理加速框架提升 VLM 推理性能 +## 3. 使用 VLM 推理服务提升推理性能 -默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何使用 vLLM 和 SGLang 推理加速框架来提升 PaddleOCR-VL 的推理性能。 +默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何通过 VLM 推理服务提升 PaddleOCR-VL 的推理性能。在当前硬件文档中,示例使用 vLLM 和 SGLang 作为 VLM 推理服务后端。 ### 3.1 启动 VLM 推理服务 @@ -126,7 +132,7 @@ docker run \ #### 3.1.2 方法二:通过 PaddleOCR CLI 安装和使用 -由于推理加速框架可能与飞桨框架存在依赖冲突,建议在虚拟环境中安装: +由于推理加速框架可能与当前环境中的包存在依赖冲突,建议在虚拟环境中安装: ```shell # 如果当前存在已激活的虚拟环境,先通过 `deactivate` 取消激活 @@ -155,6 +161,9 @@ paddleocr install_genai_server_deps <推理加速框架名称> 当前支持的框架名称为 `vllm` 和 `sglang`,分别对应 vLLM 和 SGLang。 +> WARNING: +> 目前 vLLM 和 SGLang 与 Transformers 引擎所需的 transformers 库版本存在冲突,因此同一环境中无法同时安装 Transformers 引擎与 vLLM 或 SGLang。如果使用 Transformers + vLLM 或 Transformers + SGLang 的推理方式,请将版面检测模型和 VLM 服务部署在不同环境中。 + 安装完成后,可通过 `paddleocr genai_server` 命令启动服务: ```shell @@ -174,7 +183,7 @@ paddleocr genai_server --model_name PaddleOCR-VL-1.5-0.9B --backend vllm --port ### 3.2 客户端使用方法 -请参考[PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。 +客户端调用方式请参考 [PaddleOCR-VL 使用教程 - 3.2 客户端使用方法](./PaddleOCR-VL.md#32)。如需在当前硬件上运行客户端,请注意指定 `device="gpu"`。 ### 3.3 性能调优 diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL.en.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL.en.md index f34f6489717..6b03a0ef1d8 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL.en.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL.en.md @@ -4,45 +4,78 @@ comments: true # PaddleOCR-VL Usage Tutorial -PaddleOCR-VL is an advanced and efficient document parsing model designed specifically for element recognition in documents. Its core component is PaddleOCR-VL-0.9B, a compact yet powerful Vision-Language Model (VLM) composed of a NaViT-style dynamic resolution visual encoder and the ERNIE-4.5-0.3B language model, enabling precise element recognition. The model supports 109 languages and excels in recognizing complex elements (such as text, tables, formulas, and charts) while maintaining extremely low resource consumption. 
Comprehensive evaluations on widely used public benchmarks and internal benchmarks demonstrate that PaddleOCR-VL achieves SOTA performance in both page-level document parsing and element-level recognition. It significantly outperforms existing Pipeline-based solutions, document parsing multimodal schemes, and advanced general-purpose multimodal large models, while offering faster inference speeds. These advantages make it highly suitable for deployment in real-world scenarios. +> INFO: +> PaddleOCR provides a unified interface for the PaddleOCR-VL model series to facilitate quick setup and usage. Unless otherwise specified, the term "PaddleOCR-VL" in this tutorial and related hardware usage tutorials refers to the PaddleOCR-VL model series (e.g., PaddleOCR-VL-1.5). References specific to the PaddleOCR-VL v1 version will be explicitly noted. + +PaddleOCR-VL is an advanced and efficient document parsing model designed specifically for element recognition in documents. Taking its initial version (PaddleOCR-VL v1) as an example, its core component is PaddleOCR-VL-0.9B, a compact yet powerful Vision-Language Model (VLM) composed of a NaViT-style dynamic resolution visual encoder and the ERNIE-4.5-0.3B language model, enabling precise element recognition. The model series supports 109 languages and excels in recognizing complex elements (such as text, tables, formulas, and charts) while maintaining extremely low resource consumption. Comprehensive evaluations on widely used public benchmarks and internal benchmarks demonstrate that PaddleOCR-VL achieves SOTA performance in both page-level document parsing and element-level recognition. It significantly outperforms existing Pipeline-based solutions, document parsing multimodal schemes, and advanced general-purpose multimodal large models, while offering faster inference speeds. These advantages make it highly suitable for deployment in real-world scenarios. **On January 29, 2026, we released PaddleOCR-VL-1.5. 
PaddleOCR-VL-1.5 not only significantly improved the accuracy on the OmniDocBench v1.5 evaluation set to 94.5%, but also innovatively supports irregular-shaped bounding box localization. As a result, PaddleOCR-VL-1.5 demonstrates outstanding performance in real-world scenarios such as Skew, Warping, Screen Photography, Illumination, and Scanning. In addition, the model has added new capabilities for seal (stamp) recognition and text detection and recognition, with key metrics continuing to lead the industry.** -> INFO: -> PaddleOCR provides a unified interface for the PaddleOCR-VL model series to facilitate quick setup and usage. Unless otherwise specified, the term "PaddleOCR-VL" in subsequent sections and related usage tutorials refers to the PaddleOCR-VL model series (e.g., PaddleOCR-VL-1.5). References specific to the PaddleOCR-VL v1 version will be explicitly noted. - ## Process Guide -Before starting, please refer to the next section for information on the inference device support provided by PaddleOCR-VL to **determine if your device meets the operational requirements.** If your device meets the requirements, please select the relevant section to read based on your needs. +You can first choose a reading path based on your goal, and then confirm whether you should continue with this tutorial or switch to the corresponding hardware-specific tutorial for the same chapter. + +Before getting started, we recommend first identifying your device type: + +- **x64 CPU**: You can read this tutorial directly. +- **NVIDIA GPU**: + - If you are using a **Blackwell-architecture GPU** such as the RTX 50 series, we recommend first continuing with this process guide to determine your goal, and then referring to the corresponding chapters in the [PaddleOCR-VL NVIDIA Blackwell-Architecture GPUs Usage Tutorial](./PaddleOCR-VL-NVIDIA-Blackwell.en.md). + - For other NVIDIA GPUs, you can read this tutorial directly. 
+- **Apple Silicon, Kunlunxin XPU, Hygon DCU, MetaX GPU, Iluvatar GPU, and Huawei Ascend NPU**: We recommend first continuing with this process guide to determine your goal, and then referring to the corresponding chapters in the dedicated tutorial for your hardware. + +Before proceeding directly to the following sections along the path described above, if you need to confirm which inference methods PaddleOCR-VL supports in your current hardware environment (for example, using the PaddlePaddle framework as the inference engine), please continue to the next section, “Inference Device Support for PaddleOCR-VL”. -For some inference hardware, you may need to refer to other usage tutorials we provide, but the process remains the same and does not affect your reading of the following process guide: +After confirming the above, choose your reading path based on your goal: -1. **Want to quickly experience PaddleOCR-VL**: +1. **Local Direct Inference (Quick Experience / Script Integration)**: - If you wish to quickly experience the inference effects of PaddleOCR-VL, please read [1. Environment Preparation](#1-environment-preparation) and [2. Quick Start](#2-quick-start), or the corresponding chapters in documentation for other hardware. + Suitable for directly calling PaddleOCR-VL on the local machine through the PaddleOCR CLI or Python API. + This category usually corresponds to local inference engine methods such as PaddlePaddle or Transformers. -2. **Want to use PaddleOCR-VL in a production environment**: + Please read [1. Environment Preparation](#1-environment-preparation) and [2. Quick Start](#2-quick-start), or the corresponding chapters in the hardware-specific tutorial. - Although the quick experience allows you to feel the effects of PaddleOCR-VL, it may not be optimal in terms of inference speed and GPU memory usage. If you wish to apply PaddleOCR-VL in a production environment and have higher requirements for inference performance, please read [3. 
Enhancing VLM Inference Performance Using Inference Acceleration Frameworks](#3-enhancing-vlm-inference-performance-using-inference-acceleration-frameworks) or the corresponding chapter in documentation for other hardware. +2. **Client with a VLM Inference Service (Performance-Focused)**: -3. **Want to deploy PaddleOCR-VL as an API service**: + Suitable for offloading only the VLM stage to a dedicated inference service for better performance. You can either deploy your own VLM inference service based on backends such as `vLLM`, `SGLang`, `FastDeploy`, `MLX-VLM`, and `llama.cpp`, or directly use a compatible managed service. + This category usually corresponds to combinations of "Layout Detection Inference Method + VLM Inference Service". - You can deploy PaddleOCR-VL as a web service (API), allowing client applications to invoke PaddleOCR-VL's capabilities through a specific URL without configuring the environment. If concurrent request processing is not required, choose either of the following two methods: + It is recommended to first complete the basic local direct inference flow described in the previous item, and then continue with [3. Improving Inference Performance with VLM Inference Services](#3-vlm) or the corresponding chapters in the hardware-specific tutorial. - - Deployment using Docker Compose (one-click start, recommended): Please read [4.1 Method 1: Deploy Using Docker Compose](#41-method-1-deploy-using-docker-compose-recommended) and [4.3 Client-Side Invocation](#43-client-side-invocation), or the corresponding chapters in documentation for other hardware. - - Manual deployment: Please read [1. Environment Preparation](#1-environment-preparation), [4.2 Method 2: Manual Deployment](#42-method-2-manual-deployment), and [4.3 Client-Side Invocation](#43-client-side-invocation), or the corresponding chapters in documentation for other hardware. + Note that **Section 3 launches a VLM inference service, not the full PaddleOCR-VL API service**. 
Other stages such as layout detection are still executed on the client side. + +3. **Deploy the Full API Service**: + + Suitable for packaging the full PaddleOCR-VL capability as a web service so that the client only needs to call it through an HTTP interface. Unlike the previous option, what is deployed here is an API service that directly exposes the complete PaddleOCR-VL capability, rather than a backend service that is only responsible for VLM inference. If you do not have special requirements for concurrent request processing, choose either of the following: + + - Deployment using Docker Compose (one-click startup, recommended): this uses the "PaddlePaddle + VLM Inference Service" inference method, where the underlying VLM service uses an inference acceleration framework. Please read [4.1 Method 1: Deploy Using Docker Compose](#41-method-1-deploy-using-docker-compose-recommended) and [4.3 Client-Side Invocation](#43-client-side-invocation), or the corresponding chapters in the hardware-specific tutorial. + - Manual deployment: by default, this uses PaddlePaddle inference. You can also switch to Transformers, or configure a VLM inference service to form a "Layout Detection Inference Method + VLM Inference Service" combination. Please read [1. Environment Preparation](#1-environment-preparation), [4.2 Method 2: Manual Deployment](#42-method-2-manual-deployment), and [4.3 Client-Side Invocation](#43-client-side-invocation), or the corresponding chapters in the hardware-specific tutorial. For concurrent request processing, please refer to the [High-Performance Service Deployment solution](https://github.com/PaddlePaddle/PaddleOCR/blob/main/deploy/paddleocr_vl_docker/hps/README_en.md). -4. **Want to fine-tune PaddleOCR-VL to adapt to specific business needs**: +4. **Model Fine-tuning**: + + If you find that the accuracy of PaddleOCR-VL in specific business scenarios does not meet expectations, please read [5. 
Model Fine-tuning](#5-model-fine-tuning) or the corresponding chapters in the hardware-specific tutorial. - If you find that the accuracy performance of PaddleOCR-VL in specific business scenarios does not meet expectations, please read [5. Model Fine-tuning](#5-model-fine-tuning) or the corresponding chapter in documentation for other hardware. +Hardware-specific usage tutorials: + +| Hardware Type | Usage Tutorial | +|----------------|----------------| +| x64 CPU | This tutorial (currently supports manual dependency installation only) | +| NVIDIA GPU | - NVIDIA Blackwell architecture GPUs (such as the RTX 50 series): [PaddleOCR-VL NVIDIA Blackwell Architecture GPU Usage Tutorial](./PaddleOCR-VL-NVIDIA-Blackwell.en.md)
      - Other NVIDIA GPUs: this tutorial | +| Kunlunxin XPU | [PaddleOCR-VL Kunlunxin XPU Usage Tutorial](./PaddleOCR-VL-Kunlunxin-XPU.en.md) | +| Hygon DCU | [PaddleOCR-VL Hygon DCU Usage Tutorial](./PaddleOCR-VL-Hygon-DCU.en.md) | +| MetaX GPU | [PaddleOCR-VL MetaX GPU Usage Tutorial](./PaddleOCR-VL-MetaX-GPU.en.md) | +| Iluvatar GPU | [PaddleOCR-VL Iluvatar GPU Usage Tutorial](./PaddleOCR-VL-Iluvatar-GPU.en.md) | +| Huawei Ascend NPU | [PaddleOCR-VL Huawei Ascend NPU Usage Tutorial](./PaddleOCR-VL-Huawei-Ascend-NPU.en.md) | +| Apple Silicon | [PaddleOCR-VL Apple Silicon Usage Tutorial](./PaddleOCR-VL-Apple-Silicon.en.md) | +| AMD GPU | [PaddleOCR-VL AMD GPU Usage Tutorial](./PaddleOCR-VL-AMD-GPU.en.md) | +| Intel Arc GPU | [PaddleOCR-VL Intel Arc GPU Usage Tutorial](./PaddleOCR-VL-Intel-Arc-GPU.en.md) | ## Inference Device Support for PaddleOCR-VL -Currently, PaddleOCR-VL offers six inference methods, with varying levels of support for different inference devices. Please confirm that your inference device meets the requirements in the table below before proceeding with PaddleOCR-VL deployment: +PaddleOCR-VL currently provides multiple inference methods, and the supported inference devices are not exactly the same. Please confirm that your inference device meets the requirements in the table below before deploying PaddleOCR-VL: @@ -74,6 +107,19 @@ Currently, PaddleOCR-VL offers six inference methods, with varying levels of sup + + + + + + + + + + + + + @@ -139,48 +185,99 @@ Currently, PaddleOCR-VL offers six inference methods, with varying levels of sup + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      Transformers🚧🚧🚧🚧🚧🚧🚧🚧
      PaddlePaddle + vLLM 🚧 🚧
      Transformers + vLLM🚧🚧🚧🚧🚧--🚧🚧
      Transformers + SGLang🚧🚧🚧🚧🚧--🚧🚧
      Transformers + FastDeploy🚧🚧🚧🚧🚧--🚧🚧
      Transformers + MLX-VLM---------
      Transformers + llama.cpp🚧🚧🚧🚧🚧🚧🚧🚧
      Explanation of Inference Method -"PaddlePaddle" indicates that both the layout detection model and the VLM use the PaddlePaddle framework for inference. This is the default mode for the PaddleOCR CLI and Python API. Other inference method follow the format "Layout Detection Model Inference method + VLM Inference method". For example, "PaddlePaddle + vLLM" means the layout detection model uses PaddlePaddle, while the VLM uses vLLM. +"PaddlePaddle" indicates that both the layout detection model and the VLM use the PaddlePaddle framework for inference. This is the default mode for the PaddleOCR CLI and Python API. "Transformers" indicates that both the layout detection model and the VLM use the Transformers engine for inference. Other inference methods follow the format "Layout Detection Model Inference Method + VLM Inference Method". For example, "PaddlePaddle + vLLM" means that the layout detection model uses PaddlePaddle for inference, while the VLM uses vLLM.
      > TIP: > - When using NVIDIA GPU for inference, ensure that the Compute Capability (CC) and CUDA version meet the requirements: > > - PaddlePaddle: CC ≥ 7.0, CUDA ≥ 11.8 +> > - Transformers: CC ≥ 7.0, CUDA ≥ 11.8 > > - vLLM: CC ≥ 8.0, CUDA ≥ 12.6 > > - SGLang: 8.0 ≤ CC < 12.0, CUDA ≥ 12.6 > > - FastDeploy: 8.0 ≤ CC < 12.0, CUDA ≥ 12.6 > > - Common GPUs with CC ≥ 8 include RTX 30/40/50 series and A10/A100, etc. For more models, refer to [CUDA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) > - vLLM compatibility note: Although vLLM can be launched on NVIDIA GPUs with CC 7.x such as T4/V100, timeout or OOM issues may occur, and its use is not recommended. > - vLLM, SGLang, and FastDeploy cannot run natively on Windows. Please use the Docker images we provide. - -Since different hardware requires different dependencies, if your hardware meets the requirements in the table above, please refer to the following table for the corresponding usage tutorial: - -| Hardware Type | Usage Tutorial | -|----------------|------------------------------------------------------------------------------------------------------------------------------| -| x64 CPU | This tutorial (Dependencies must be installed manually for now) | -| NVIDIA GPU | - NVIDIA Blackwell architecture GPU (e.g., RTX 50 series) refer to [PaddleOCR-VL NVIDIA Blackwell Architecture GPU Usage Tutorial](./PaddleOCR-VL-NVIDIA-Blackwell.en.md)
      - Other NVIDIA GPUs refer to this tutorial | -| Kunlunxin XPU | [PaddleOCR-VL Kunlunxin XPU Usage Tutorial](./PaddleOCR-VL-Kunlunxin-XPU.en.md) | -| Hygon DCU | [PaddleOCR-VL Hygon DCU Usage Tutorial](./PaddleOCR-VL-Hygon-DCU.en.md) | -| MetaX GPU | [PaddleOCR-VL MetaX GPU Usage Tutorial](./PaddleOCR-VL-MetaX-GPU.en.md) | -| Iluvatar GPU | [PaddleOCR-VL Iluvatar GPU Usage Tutorial](./PaddleOCR-VL-Iluvatar-GPU.en.md) | -| Huawei Ascend NPU | [PaddleOCR-VL Huawei Ascend NPU Usage Tutorial](./PaddleOCR-VL-Huawei-Ascend-NPU.en.md) | -| Apple Silicon | [PaddleOCR-VL Apple Silicon Usage Tutorial](./PaddleOCR-VL-Apple-Silicon.en.md) | -| AMD GPU | [PaddleOCR-VL AMD GPU Usage Tutorial](./PaddleOCR-VL-AMD-GPU.en.md) | -| Intel Arc GPU | [PaddleOCR-VL Intel Arc GPU Usage Tutorial](./PaddleOCR-VL-Intel-Arc-GPU.en.md) | - -> TIP: -> For example, if you are using an RTX 50 series GPU that meets the device requirements for both PaddlePaddle and vLLM inference methods, please refer to the [PaddleOCR-VL NVIDIA Blackwell Architecture GPU Usage Tutorial](./PaddleOCR-VL-NVIDIA-Blackwell.en.md) to learn about relevant configurations and usage. +> - Due to dependency conflicts between different libraries, when using mixed inference methods like Transformers + vLLM, it is recommended to deploy the layout detection model and VLM service in different environments. ## 1. Environment Preparation -This section explains how to set up the runtime environment for PaddleOCR-VL. Choose one of the following two methods: +This section explains how to set up the runtime environment for PaddleOCR-VL. This tutorial mainly applies to **x64 CPU** users and **NVIDIA GPU** users other than Blackwell. For other hardware, please refer first to the dedicated tutorials listed above. + +This tutorial provides the following two methods for environment preparation: -- Method 1: Use the official Docker image. +- Method 1: Use the official Docker image (NVIDIA GPU only). 
-Method 2: Manually install PaddlePaddle and PaddleOCR. +- Method 2: Manually install the inference engine and PaddleOCR (available for both x64 CPU and NVIDIA GPU). **We strongly recommend using the Docker image to minimize potential environment-related issues.** @@ -214,12 +311,14 @@ docker load -i paddleocr-vl-latest-nvidia-gpu-offline.tar # After that, you can use `docker run` to start the container on the offline machine ``` +The image comes preinstalled with the PaddlePaddle framework and does not include any other inference engines. If you want to use other inference engines, it is recommended to install them manually using Method 2 (it is not recommended to install them in an environment where the PaddlePaddle framework is preinstalled). + > TIP: -> Images with the `latest-xxx` tag correspond to the latest version of PaddleOCR. If you want to use a specific version of the PaddleOCR image, you can replace `latest` in the tag with the desired version number: `paddleocr<major>.<minor>`. +> Images with the `latest-xxx` tag correspond to the latest version. If you want to use a specific version of the image, you can replace `latest` in the tag with the desired PaddleOCR version number: `paddleocr<major>.<minor>`. > For example: > `ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleocr-vl:paddleocr3.3-nvidia-gpu-offline` -### 1.2 Method 2: Manually Install PaddlePaddle and PaddleOCR +### 1.2 Method 2: Manually Install the Inference Engine and PaddleOCR If you cannot use Docker, you can manually install PaddlePaddle and PaddleOCR. The required Python version is 3.8–3.13. @@ -232,55 +331,84 @@ python -m venv .venv_paddleocr source .venv_paddleocr/bin/activate ``` -Run the following commands to complete the installation: +Please first install the dependencies corresponding to your chosen inference engine: + +- If you use PaddlePaddle for inference, install PaddlePaddle 3.2.1 or later (**do not install both the CPU and GPU versions of PaddlePaddle at the same time**).
Common installation commands are as follows: ```shell -# The following command installs the PaddlePaddle version for CUDA 12.6. For other CUDA versions and the CPU version, please refer to https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html +# NVIDIA GPU (CUDA 12.6 as an example) python -m pip install paddlepaddle-gpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ -python -m pip install -U "paddleocr[doc-parser]" + +# x64 CPU +python -m pip install paddlepaddle==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ ``` -> IMPORTANT: -> **Please ensure that you install PaddlePaddle framework version 3.2.1 or above.** + For other CUDA versions, please refer to the PaddlePaddle installation guide: <https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html> + +- If you use `transformers` for inference, refer to the [official Transformers documentation](https://huggingface.co/docs/transformers/installation) to install `transformers` and the required low-level inference framework dependencies. + +After installing the inference engine, run the following command to install the base package required by PaddleOCR-VL: + +```shell +python -m pip install -U "paddleocr[doc-parser]" +``` ## 2. Quick Start -PaddleOCR-VL supports two usage methods: CLI command line and Python API. The CLI command line method is simpler and suitable for quickly verifying functionality, while the Python API method is more flexible and suitable for integration into existing projects. +This section introduces how to use PaddleOCR-VL through the CLI and Python API. + +PaddleOCR-VL supports both CLI and Python API usage. The CLI method is simpler and suitable for quick verification, while the Python API is more flexible and suitable for integration into existing projects. The examples below use PaddlePaddle inference by default. To switch to the `transformers` engine, append `--engine transformers` in the CLI, or pass `engine="transformers"` when initializing the Python API.
> IMPORTANT: -> The methods introduced in this section are primarily for rapid validation. Their inference speed, memory usage, and stability may not meet the requirements of a production environment. **If deployment to a production environment is needed, we strongly recommend using a dedicated inference acceleration framework**. For specific methods, please refer to the next section. +> The methods introduced in this section are primarily for rapid validation. Their inference speed, memory usage, and stability may not meet the requirements of a production environment. **If deployment to a production environment is needed, we strongly recommend using a dedicated VLM inference service**. For specific methods, please refer to the next section. ### 2.1 Command Line Usage -Run a single command to quickly test the PaddleOCR-VL : +When you run PaddleOCR-VL for the first time, it will automatically download the official model files. Please make sure the current environment has internet access and allow some extra time for downloading and initialization. + +If you would like to use the local demo image from this document directly, you can download it first: + +```shell +curl -L -o paddleocr_vl_demo.png https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png +``` + +The following are ready-to-copy example commands. 
For the first try, we recommend adding `--save_path ./output` so that you can inspect the saved results in the current directory: ```shell # NVIDIA GPU -paddleocr doc_parser -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png +paddleocr doc_parser -i ./paddleocr_vl_demo.png --save_path ./output # Kunlunxin XPU -paddleocr doc_parser -i ./paddleocr_vl_demo.png --device xpu +paddleocr doc_parser -i ./paddleocr_vl_demo.png --device xpu --save_path ./output # Hygon DCU -paddleocr doc_parser -i ./paddleocr_vl_demo.png --device dcu +paddleocr doc_parser -i ./paddleocr_vl_demo.png --device dcu --save_path ./output # MetaX GPU -paddleocr doc_parser -i ./paddleocr_vl_demo.png --device metax_gpu +paddleocr doc_parser -i ./paddleocr_vl_demo.png --device metax_gpu --save_path ./output # Apple Silicon -paddleocr doc_parser -i ./paddleocr_vl_demo.png --device cpu +paddleocr doc_parser -i ./paddleocr_vl_demo.png --device cpu --save_path ./output -# Huawei Ascend NPU -# Huawei Ascend NPU please refer to Chapter 3 for inference using PaddlePaddle + vLLM +# Huawei Ascend NPU +# For Huawei Ascend NPU, please refer to Chapter 3 and use PaddlePaddle + vLLM for inference # Use --use_doc_orientation_classify to enable document orientation classification -paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_doc_orientation_classify True +paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_doc_orientation_classify True --save_path ./output -# Use --use_doc_unwarping to enable document unwarping module -paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_doc_unwarping True +# Use --use_doc_unwarping to enable the document unwarping module +paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_doc_unwarping True --save_path ./output -# Use --use_layout_detection to enable layout detection -paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_layout_detection False +# Use --use_layout_detection to disable the layout detection and ordering module 
+paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_layout_detection False --save_path ./output +``` + +After successful execution, the terminal will print the structured result. If you set `--save_path ./output`, the result files will also be saved under the `output` directory in the current working directory for further inspection and debugging. + +To switch to the `transformers` engine, use: + +```bash +paddleocr doc_parser -i ./paddleocr_vl_demo.png --engine transformers --save_path ./output ```
      Command line supports more parameters. Click to expand for detailed parameter descriptions @@ -612,57 +740,69 @@ Supports specifying specific card numbers:
        +engine +Meaning: Inference engine.
        Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + enable_hpi -Meaning:Whether to enable high-performance inference. +Meaning: Whether to enable high-performance inference. bool - +None use_tensorrt -Meaning:Whether to enable the TensorRT subgraph engine of Paddle Inference. -If the model does not support acceleration via TensorRT, acceleration will not be used even if this flag is set.
        For PaddlePaddle version with CUDA 11.8, the compatible TensorRT version is 8.x (x&gt;=6). It is recommended to install TensorRT 8.6.1.6.
        +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
        +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
        +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
        bool - +False precision -Meaning:Computational precision, such as fp32, fp16. +Meaning: Computation precision, such as fp32 or fp16. str - +fp32 enable_mkldnn -Meaning:Whether to enable MKL-DNN accelerated inference.
        +Meaning: Whether to enable MKL-DNN accelerated inference.
        Description: -If MKL-DNN is not available or the model does not support acceleration via MKL-DNN, acceleration will not be used even if this flag is set. +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. + bool - +True mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. + +Meaning: MKL-DNN cache capacity. + int - +10 cpu_threads -Meaning:The number of threads used for inference on the CPU. +Meaning: Number of threads used for inference on CPU. int - +10 paddlex_config -Meaning:The file path for PaddleX production line configuration. +Meaning: Path to the PaddleX pipeline configuration file. str +

      -The inference result will be printed in the terminal. The default output of the PP-StructureV3 pipeline is as follows: +The inference result will be printed in the terminal. The default output of PaddleOCR-VL is as follows:
      👉Click to expand
      @@ -670,17 +810,22 @@ The inference result will be printed in the terminal. The default output of the
       {'res': {'input_path': 'paddleocr_vl_demo.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_layout_detection': True, 'use_chart_recognition': False, 'format_block_content': False}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 6, 'label': 'doc_title', 'score': 0.9636914134025574, 'coordinate': [np.float32(131.31366), np.float32(36.450516), np.float32(1384.522), np.float32(127.984665)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9281806349754333, 'coordinate': [np.float32(585.39465), np.float32(158.438), np.float32(930.2184), np.float32(182.57469)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9840355515480042, 'coordinate': [np.float32(9.023666), np.float32(200.86115), np.float32(361.41583), np.float32(343.8828)]}, {'cls_id': 14, 'label': 'image', 'score': 0.9871416091918945, 'coordinate': [np.float32(775.50574), np.float32(200.66502), np.float32(1503.3807), np.float32(684.9304)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9801855087280273, 'coordinate': [np.float32(9.532196), np.float32(344.90594), np.float32(361.4413), np.float32(440.8244)]}, {'cls_id': 17, 'label': 'paragraph_title', 'score': 0.9708921313285828, 'coordinate': [np.float32(28.040405), np.float32(455.87976), np.float32(341.7215), np.float32(520.7117)]}, {'cls_id': 24, 'label': 'vision_footnote', 'score': 0.9002962708473206, 'coordinate': [np.float32(809.0692), np.float32(703.70044), np.float32(1488.3016), np.float32(750.5238)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9825374484062195, 'coordinate': [np.float32(8.896561), np.float32(536.54895), np.float32(361.05237), np.float32(655.8058)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9822263717651367, 'coordinate': [np.float32(8.971573), np.float32(657.4949), np.float32(362.01715), np.float32(774.625)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9767460823059082, 'coordinate': [np.float32(9.407074), np.float32(776.5216), np.float32(361.31067), np.float32(846.82874)]}, 
{'cls_id': 22, 'label': 'text', 'score': 0.9868153929710388, 'coordinate': [np.float32(8.669495), np.float32(848.2543), np.float32(361.64703), np.float32(1062.8568)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9826608300209045, 'coordinate': [np.float32(8.8025055), np.float32(1063.8615), np.float32(361.46588), np.float32(1182.8524)]}, {'cls_id': 22, 'label': 'text', 'score': 0.982555627822876, 'coordinate': [np.float32(8.820602), np.float32(1184.4663), np.float32(361.66394), np.float32(1302.4507)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9584776759147644, 'coordinate': [np.float32(9.170288), np.float32(1304.2161), np.float32(361.48898), np.float32(1351.7483)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9782056212425232, 'coordinate': [np.float32(389.1618), np.float32(200.38202), np.float32(742.7591), np.float32(295.65146)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9844875931739807, 'coordinate': [np.float32(388.73303), np.float32(297.18463), np.float32(744.00024), np.float32(441.3034)]}, {'cls_id': 17, 'label': 'paragraph_title', 'score': 0.9680547714233398, 'coordinate': [np.float32(409.39468), np.float32(455.89386), np.float32(721.7174), np.float32(520.9387)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9741666913032532, 'coordinate': [np.float32(389.71606), np.float32(536.8138), np.float32(742.7112), np.float32(608.00165)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9840384721755981, 'coordinate': [np.float32(389.30988), np.float32(609.39636), np.float32(743.09247), np.float32(750.3231)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9845995306968689, 'coordinate': [np.float32(389.13272), np.float32(751.7772), np.float32(743.058), np.float32(894.8815)]}, {'cls_id': 22, 'label': 'text', 'score': 0.984852135181427, 'coordinate': [np.float32(388.83267), np.float32(896.0371), np.float32(743.58215), np.float32(1038.7345)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9804865717887878, 'coordinate': [np.float32(389.08478), np.float32(1039.9119), 
np.float32(742.7585), np.float32(1134.4897)]}, {'cls_id': 22, 'label': 'text', 'score': 0.986461341381073, 'coordinate': [np.float32(388.52643), np.float32(1135.8137), np.float32(743.451), np.float32(1352.0085)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9869391918182373, 'coordinate': [np.float32(769.8341), np.float32(775.66235), np.float32(1124.9813), np.float32(1063.207)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9822869896888733, 'coordinate': [np.float32(770.30383), np.float32(1063.938), np.float32(1124.8295), np.float32(1184.2192)]}, {'cls_id': 17, 'label': 'paragraph_title', 'score': 0.9689218997955322, 'coordinate': [np.float32(791.3042), np.float32(1199.3169), np.float32(1104.4521), np.float32(1264.6985)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9713128209114075, 'coordinate': [np.float32(770.4253), np.float32(1279.6072), np.float32(1124.6917), np.float32(1351.8672)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9236552119255066, 'coordinate': [np.float32(1153.9058), np.float32(775.5814), np.float32(1334.0654), np.float32(798.1581)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9857938885688782, 'coordinate': [np.float32(1151.5197), np.float32(799.28015), np.float32(1506.3619), np.float32(991.1156)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9820687174797058, 'coordinate': [np.float32(1151.5686), np.float32(991.91095), np.float32(1506.6023), np.float32(1110.8875)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9866049885749817, 'coordinate': [np.float32(1151.6919), np.float32(1112.1301), np.float32(1507.1611), np.float32(1351.9504)]}]}}}
       
      -For explanation of the result parameters, refer to [2.2 Python Script Integration](#22-python-script-integration). +For detailed descriptions of the running results and saving interfaces, refer to the result explanation in [2.2 Python Script Integration](#22-python-script-integration). -Note: The default model for the pipeline is relatively large, which may result in slower inference speed. It is recommended to use [inference acceleration frameworks to enhance VLM inference performance](#31-launching-the-vlm-inference-service) for faster inference. +Note: Since the default model of PaddleOCR-VL is relatively large, inference may be slow. For actual use, it is recommended to use [3. Improving Inference Performance with VLM Inference Services](#3-vlm) for faster inference. ### 2.2 Python Script Integration -The command line method is for quick testing and visualization. In actual projects, you usually need to integrate the model via code. You can perform pipeline inference with just a few lines of code as shown below: +The command line method is intended for quick testing and visualization. In real projects, you usually integrate the model through code. 
You can quickly run PaddleOCR-VL inference with just a few lines of code: ```python +from pathlib import Path + from paddleocr import PaddleOCRVL +output_dir = Path("./output") +output_dir.mkdir(parents=True, exist_ok=True) + # NVIDIA GPU pipeline = PaddleOCRVL() # Kunlunxin XPU @@ -701,17 +846,40 @@ pipeline = PaddleOCRVL() output = pipeline.predict("./paddleocr_vl_demo.png") for res in output: res.print() ## Print the structured prediction output - res.save_to_json(save_path="output") ## Save the current image's structured result in JSON format - res.save_to_markdown(save_path="output") ## Save the current image's result in Markdown format + res.save_to_json(save_path=output_dir) ## Save the current image's structured result in JSON format + res.save_to_markdown(save_path=output_dir) ## Save the current image's result in Markdown format + res.save_to_word(save_path="output") ## Save the current image's result in Word format +``` + +To switch to the `transformers` engine, use: + +```python +from pathlib import Path + +from paddleocr import PaddleOCRVL + +output_dir = Path("./output") +output_dir.mkdir(parents=True, exist_ok=True) + +pipeline = PaddleOCRVL(engine="transformers") +output = pipeline.predict("./paddleocr_vl_demo.png") +for res in output: + res.print() ## Print the structured prediction output + res.save_to_json(save_path=output_dir) ## Save the current image's structured result in JSON format + res.save_to_markdown(save_path=output_dir) ## Save the current image's result in Markdown format res.save_to_word(save_path="output") ## Save the current image's result in Word format ``` For PDF files, each page will be processed individually, and a separate Markdown file will be generated for each page. 
If you wish to perform cross-page table merging, reconstruct multi-level headings, or merge multi-page results, you can achieve this using the following method: ```python +from pathlib import Path + from paddleocr import PaddleOCRVL input_file = "./your_pdf_file.pdf" +output_dir = Path("./output") +output_dir.mkdir(parents=True, exist_ok=True) pipeline = PaddleOCRVL() @@ -723,10 +891,11 @@ output = pipeline.restructure_pages(pages_res) # output = pipeline.restructure_pages(pages_res, merge_tables=True) # Merge tables across pages # output = pipeline.restructure_pages(pages_res, merge_tables=True, relevel_titles=True) # Merge tables across pages and reconstruct multi-level titles # output = pipeline.restructure_pages(pages_res, merge_tables=True, relevel_titles=True, concatenate_pages=True) # Merge tables across pages, reconstruct multi-level titles, and merge multiple pages + for res in output: res.print() ## Print the structured prediction output - res.save_to_json(save_path="output") ## Save the current image's structured result in JSON format - res.save_to_markdown(save_path="output") ## Save the current image's result in Markdown format + res.save_to_json(save_path=output_dir) ## Save the current image's structured result in JSON format + res.save_to_markdown(save_path=output_dir) ## Save the current image's result in Markdown format ``` If you need to process multiple files, **it is recommended to pass the directory path containing the files or a list of file paths to the `predict` method** to maximize processing efficiency. For example: @@ -744,6 +913,8 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"] **Note:** +- In the example code above, the `use_doc_orientation_classify` and `use_doc_unwarping` parameters are both set to `False` by default, meaning document orientation classification and document unwarping are disabled. If you need these features, set them to `True` manually. 
+ The above Python script performs the following steps: @@ -999,53 +1170,69 @@ Supports specifying specific card numbers:
        str|None None + +engine +Meaning: Inference engine.
        Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
        Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + + enable_hpi -Meaning:Whether to enable high-performance inference. +Meaning: Whether to enable high-performance inference. bool -False +None use_tensorrt -Meaning:Whether to enable the TensorRT subgraph engine of Paddle Inference.
        -Description: -If the model does not support acceleration via TensorRT, acceleration will not be used even if this flag is set.
        For PaddlePaddle version with CUDA 11.8, the compatible TensorRT version is 8.x (x&gt;=6). It is recommended to install TensorRT 8.6.1.6.
        +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
        +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
        +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
        bool False precision -Meaning:Computational precision, such as fp32, fp16. +Meaning: Computation precision, such as "fp32" or "fp16". str "fp32" enable_mkldnn -Meaning:Whether to enable MKL-DNN accelerated inference.
        -Description: -If MKL-DNN is not available or the model does not support acceleration via MKL-DNN, acceleration will not be used even if this flag is set. +Meaning: Whether to enable MKL-DNN accelerated inference.
        +Description: +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. + bool True mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. + +Meaning: MKL-DNN cache capacity. + int 10 cpu_threads -Meaning:The number of threads used for inference on the CPU. +Meaning: Number of threads used for inference on CPU. int -8 +10 paddlex_config -Meaning:The file path for PaddleX pipeline configuration. +Meaning: Path to the PaddleX pipeline configuration file. str None - @@ -1062,7 +1249,6 @@ If MKL-DNN is not available or the model does not support acceleration via MKL-D - input Meaning:Data to be predicted, supporting multiple input types. Required.
        Description: @@ -1195,6 +1381,7 @@ Setting it to None means using the instantiation parameter; otherwi str|None None + format_block_content Meaning:The parameter meaning is basically the same as the instantiation parameter.
        @@ -1393,7 +1580,6 @@ Setting it to None means using the instantiation parameter; otherwi False - save_to_html() Save the tables in the file as html format files save_path @@ -1498,18 +1684,12 @@ Setting it to None means using the instantiation parameter; otherwi Obtain the prediction jsonresult in the format - img -obtain in the format of dictvisualized image - - - - - markdown -obtain in the format of dictmarkdown result - - +img +Obtain visualized images in dict format +markdown +Obtain markdown results in dict format @@ -1522,12 +1702,16 @@ Setting it to None means using the instantiation parameter; otherwi
      -## 3. Enhancing VLM Inference Performance Using Inference Acceleration Frameworks + +## 3. Improving Inference Performance with VLM Inference Services -The inference performance under default configurations is not fully optimized and may not meet actual production requirements. This step primarily introduces how to use the vLLM, SGLang and FastDeploy inference acceleration frameworks to enhance the inference performance of PaddleOCR-VL. +Using only PaddlePaddle or Transformers usually does not provide optimal inference performance. This section mainly introduces how to improve PaddleOCR-VL inference performance through VLM inference services. You can either deploy your own VLM inference service based on backends such as vLLM, SGLang, FastDeploy, MLX-VLM, and llama.cpp, or directly use compatible managed services. This section corresponds to combinations of "Layout Detection Inference Method + VLM Inference Service". Its core idea is that **the client continues to handle the other stages in the full workflow, such as layout detection, while only the VLM stage is delegated to a dedicated service**. ### 3.1 Launching the VLM Inference Service +> IMPORTANT: +> The services launched according to this section are responsible only for the VLM inference stage in the PaddleOCR-VL workflow and do not provide a complete end-to-end document parsing API. It is strongly discouraged to directly call such services through plain HTTP requests or OpenAI clients to process document images. If you need to deploy a service with the full PaddleOCR-VL capability, please refer to the service deployment section later in this document. + There are three methods to launch the VLM inference service; choose any one of them: - Method 1: Launch the service using the official Docker image. 
Currently supported: @@ -1547,7 +1731,7 @@ There are three methods to launch the VLM inference service; choose either one: **We strongly recommend using the Docker image to minimize potential environment-related issues.** -In addition, cloud platforms such as [SiliconFlow](https://siliconflow.cn/) and [Novita AI](https://novita.ai/models-console/model-detail/paddlepaddle-paddleocr-vl) also provide managed services. If you choose to use such services, you can skip this section and directly read [3.2 Client Usage Methods](#32-client-usage-methods). +In addition, cloud platforms such as [SiliconFlow](https://siliconflow.cn/) and [Novita AI](https://novita.ai/models-console/model-detail/paddlepaddle-paddleocr-vl) also provide managed services. If you choose to use such services, you can skip this subsection and directly read [3.2 Client Usage Methods](#32-client-usage-methods). #### 3.1.1 Method 1: Using Docker Image @@ -1589,11 +1773,13 @@ docker run \ --rm \ --gpus all \ --network host \ - -v vllm_config.yml:/tmp/vllm_config.yml \ + -v vllm_config.yml:/tmp/vllm_config.yml \ ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleocr-genai-vllm-server:latest-nvidia-gpu \ paddleocr genai_server --model_name PaddleOCR-VL-1.5-0.9B --host 0.0.0.0 --port 8118 --backend vllm --backend_config /tmp/vllm_config.yml ``` +Here, `vllm_config.yml` refers to a local configuration file path on the host machine. The example assumes that you created this file in the current working directory. If the file is located elsewhere, replace it with the actual absolute or relative path. + > TIP: > Images with the `latest-xxx` tag correspond to the latest version of PaddleOCR. If you want to use a specific version of the PaddleOCR image, you can replace `latest` in the tag with the desired version number: `paddleocr.`. > For example: @@ -1603,7 +1789,7 @@ docker run \ **The PaddleOCR CLI has already resolved complex version compatibility issues. 
Instead of spending time studying framework documentation, you can install the necessary environment with a single command.** -Due to potential dependency conflicts between inference acceleration frameworks and PaddlePaddle, it is recommended to install them in a virtual environment: +Since inference acceleration frameworks may conflict with packages already installed in the current environment, it is recommended to install them in a virtual environment: ```shell # If a virtual environment is currently activated, deactivate it first using `deactivate` @@ -1624,6 +1810,19 @@ python -m pip install "paddleocr[doc-parser]" paddleocr install_genai_server_deps vllm ``` +The usage of `paddleocr install_genai_server_deps` is: + +```shell +paddleocr install_genai_server_deps +``` + +Currently supported framework names are `vllm`, `sglang`, and `fastdeploy`, corresponding to vLLM, SGLang, and FastDeploy, respectively. + +Both vLLM and SGLang installed through `paddleocr install_genai_server_deps` are **CUDA 12.6** versions. Please ensure that your local NVIDIA driver supports this version or a later one. + +> WARNING: +> The `transformers` library versions required by vLLM, SGLang, and the Transformers engine are currently incompatible, so the Transformers engine cannot be installed together with vLLM or SGLang in the same environment. If you use Transformers + vLLM or Transformers + SGLang inference, please deploy the layout detection model and the VLM service in different environments. 
+ After installation, you can launch the service using the `paddleocr genai_server` command: ```shell @@ -1638,7 +1837,7 @@ The parameters supported by this command are as follows: | `--model_dir` | Model directory | | `--host` | Server hostname | | `--port` | Server port number | -| `--backend` | Backend name, i.e., the name of the inference acceleration framework used; options are `vllm` or `sglang` | +| `--backend` | Backend name, i.e., the name of the inference acceleration framework used; options are `vllm`, `sglang`, or `fastdeploy` | | `--backend_config` | Can specify a YAML file containing backend configurations | #### 3.1.3 Launch Service Directly Using Inference Acceleration Frameworks @@ -1664,7 +1863,7 @@ The parameters supported by this command are as follows: ### 3.2 Client Usage Methods -After launching the VLM inference service, the client can call the service through PaddleOCR. **Please note that because the client needs to call the layout detection model, it is still recommended to run the client on GPU or other acceleration devices to achieve more stable and efficient performance. Please refer to Section 1 for the client-side environment configuration. The configuration described in Section 3.1 applies only to starting the service and is not applicable to the client.** +After launching the VLM inference service, the client can call the service through PaddleOCR. This section applies both to self-hosted VLM inference services launched in 3.1 and to compatible managed services provided by third parties. **Please note that because the client still needs to call the layout detection model and complete the other stages in the workflow, it is still recommended to run the client on GPU or other acceleration devices to achieve more stable and efficient performance. Please refer to Section 1 for the client-side environment configuration. The configuration described in Section 3.1 applies only to starting the service and is not applicable to the client. 
If you want the client to invoke the full PaddleOCR-VL capability only through an HTTP interface, please directly refer to Section 4, "Service Deployment".** #### 3.2.1 CLI Invocation @@ -1754,7 +1953,7 @@ pipeline = PaddleOCRVL( ### 3.3 Performance Tuning -The default configuration cannot guarantee optimal performance in all environments. If users encounter performance issues in actual use, the following optimization methods can be attempted. +The default configurations cannot guarantee optimal performance in all environments. If you encounter performance issues in actual use, you can try the following optimization methods. #### 3.3.1 Server-Side Parameter Adjustment @@ -1908,7 +2107,8 @@ After generating the configuration file, add the following paddleocr-vlm-s ```yaml paddleocr-vlm-server: ... - volumes: /path/to/your_config.yaml:/home/paddleocr/vlm_server_config.yaml + volumes: + - /path/to/your_config.yaml:/home/paddleocr/vlm_server_config.yaml command: paddleocr genai_server --model_name PaddleOCR-VL-1.5-0.9B --host 0.0.0.0 --port 8118 --backend vllm --backend_config /home/paddleocr/vlm_server_config.yaml ... ``` @@ -1940,6 +2140,8 @@ Refer to section 4. Execute the following command to install the service deployment plugin via the PaddleX CLI: +> The `paddlex` command is installed together with `paddleocr`. Therefore, if you have already installed PaddleOCR in the previous steps, you usually do not need to install PaddleX separately. 
+ ```shell paddlex --install serving ``` @@ -1950,6 +2152,12 @@ Then, start the server using the PaddleX CLI: paddlex --serve --pipeline PaddleOCR-VL ``` +To switch to the `transformers` engine for service deployment, use: + +```shell +paddlex --serve --pipeline PaddleOCR-VL --engine transformers +``` + After startup, you will see output similar to the following, with the server listening on port **8080** by default: ```text @@ -2264,6 +2472,12 @@ Below are the API reference and examples of multi-language service invocation: No +outputFormats +array | null +Optional. List of extra document formats to return. By default, no extra formats are returned. Currently only "docx" is supported. +No + + visualize boolean|null Whether to return visualization result images and intermediate images during the processing.
        @@ -2302,13 +2516,13 @@ Below are the API reference and examples of multi-language service invocation: -

        Each element inlayoutParsingResults is an object with the following attributes:

        +

        Each element in layoutParsingResults is an object with the following attributes:

        - + @@ -2332,9 +2546,14 @@ Below are the API reference and examples of multi-language service invocation: + + + + +
        Meaning Name TypeMeaning
        string|null Input image. The image is in JPEG format and encoded using Base64.
        exports object | null Optional additional exports. Present only when outputFormats is set. Example: {"docx": {"content": "..."}}, where content is the Base64-encoded file content.
        -

        markdownis an objectwith the following properties:

        +

        markdown is an object with the following properties:

        @@ -2412,6 +2631,12 @@ Below are the API reference and examples of multi-language service invocation: + + + + + +
        Whether to include formula numbers in the output Markdown text. The default is false. No
        outputFormats array | null Optional extra export formats; same meaning as outputFormats on infer. Only "docx" is supported. No
        diff --git a/docs/version3.x/pipeline_usage/PaddleOCR-VL.md b/docs/version3.x/pipeline_usage/PaddleOCR-VL.md index 4459ea7fa41..865dd46d363 100644 --- a/docs/version3.x/pipeline_usage/PaddleOCR-VL.md +++ b/docs/version3.x/pipeline_usage/PaddleOCR-VL.md @@ -4,45 +4,77 @@ comments: true # PaddleOCR-VL 使用教程 -PaddleOCR-VL 是一款先进、高效的文档解析模型,专为文档中的元素识别设计。其核心组件为 PaddleOCR-VL-0.9B,这是一种紧凑而强大的视觉语言模型(VLM),它由 NaViT 风格的动态分辨率视觉编码器与 ERNIE-4.5-0.3B 语言模型组成,能够实现精准的元素识别。该模型支持 109 种语言,并在识别复杂元素(如文本、表格、公式和图表)方面表现出色,同时保持极低的资源消耗。通过在广泛使用的公开基准与内部基准上的全面评测,PaddleOCR-VL 在页级级文档解析与元素级识别均达到 SOTA 表现。它显著优于现有的基于Pipeline方案和文档解析多模态方案以及先进的通用多模态大模型,并具备更快的推理速度。这些优势使其非常适合在真实场景中落地部署。 +> INFO: +> PaddleOCR 为 PaddleOCR-VL 系列模型提供了统一的接口,方便用户快速上手和使用。除非另有说明,本教程及相关硬件使用教程中提到的 “PaddleOCR-VL” 均指 PaddleOCR-VL 系列模型(如 PaddleOCR-VL-1.5 等);若特指 PaddleOCR-VL v1 版本,将另行明确标注。 + +PaddleOCR-VL 是一款先进、高效的文档解析模型,专为文档中的元素识别设计。以其初代版本(PaddleOCR-VL v1)为例,其核心组件为 PaddleOCR-VL-0.9B,这是一种紧凑而强大的视觉语言模型(VLM),它由 NaViT 风格的动态分辨率视觉编码器与 ERNIE-4.5-0.3B 语言模型组成,能够实现精准的元素识别。该系列模型支持 109 种语言,并在识别复杂元素(如文本、表格、公式和图表)方面表现出色,同时保持极低的资源消耗。通过在广泛使用的公开基准与内部基准上的全面评测,PaddleOCR-VL 在页级文档解析与元素级识别均达到 SOTA 表现。它显著优于现有的基于Pipeline方案和文档解析多模态方案以及先进的通用多模态大模型,并具备更快的推理速度。这些优势使其非常适合在真实场景中落地部署。 **2026年1月29日,我们发布了PaddleOCR-VL-1.5。PaddleOCR-VL-1.5不仅以94.5%精度大幅刷新了评测集OmniDocBench v1.5,更创新性地支持了异形框定位,使得PaddleOCR-VL-1.5 在扫描、倾斜、弯折、屏幕拍摄及复杂光照等真实场景中均表现优异。此外,模型还新增了印章识别与文本检测识别能力,关键指标持续领跑。** - +## 流程导览 {#流程导览} -> INFO: -> PaddleOCR 为 PaddleOCR-VL 系列模型提供了统一的接口,方便用户快速上手和使用。除非另有说明,在随后的章节及相关使用教程中提到的 “PaddleOCR-VL” 均指 PaddleOCR-VL 系列模型(如 PaddleOCR-VL-1.5 等);若特指 PaddleOCR-VL v1 版本,将另行明确标注。 +您可以先根据自己的目标选择阅读路径,再结合硬件类型确认应阅读本教程,还是对应硬件教程中的相同章节。 + +在开始前,建议先确认您的设备类型: + +- **x64 CPU**:可直接阅读本教程。 +- **英伟达 GPU**: + - 如果是 **RTX 50 系等 Blackwell 架构 GPU**,建议先继续阅读本节流程导览,确定使用目标;随后再参考 [PaddleOCR-VL NVIDIA Blackwell 架构 GPU 使用教程](./PaddleOCR-VL-NVIDIA-Blackwell.md) 中对应的章节。 + - 其他英伟达 GPU 可直接阅读本教程。 +- **Apple Silicon、昆仑芯 XPU、海光 DCU、沐曦 GPU、天数 GPU、华为昇腾 NPU**:建议先继续阅读本节流程导览,确定使用目标;随后再参考对应硬件教程中的相同章节。 + 
+在按照上述路径直接阅读后续章节之前,如果您需要确认 PaddleOCR-VL 在当前硬件环境下支持哪些推理方式(例如使用 PaddlePaddle 框架作为推理引擎),请继续阅读下一节“PaddleOCR-VL 对推理设备的支持情况”。 + +确认上述信息后,再按使用目标选择路径: + +1. **本地直接推理(快速体验 / 脚本集成)**: -## 流程导览 + 适用于在本机通过 PaddleOCR CLI 或 Python API 直接调用 PaddleOCR-VL。 + 这一类通常对应本地推理引擎方式,如 PaddlePaddle 或 Transformers。 -在开始之前,请参考下一节了解 PaddleOCR-VL 对推理设备的支持情况,**以确定您的设备是否满足运行要求。** 若您的设备满足运行要求,请根据您的需求选择相关章节阅读。 + 请阅读 [1. 环境准备](#1) 和 [2. 快速开始](#2),或其他硬件文档中的对应章节。 -部分推理硬件可能需要参考我们提供的其他使用教程,但流程是一样的,不影响您的阅读下面的流程导览: +2. **客户端结合 VLM 推理服务(性能优先)**: -1. **希望快速体验 PaddleOCR-VL**: - - 如果您希望快速体验 PaddleOCR-VL 的推理效果,请阅读 [1. 环境准备](#1) 和 [2. 快速开始](#2),或其他硬件文档中的对应章节。 + 适用于将 VLM 环节交给专用推理服务处理,以提升性能。您既可以自建基于 `vLLM`、`SGLang`、`FastDeploy`、`MLX-VLM`、`llama.cpp` 等后端的 VLM 推理服务,也可以直接使用兼容的托管服务。 + 这一类通常对应“版面检测推理方式 + VLM 推理服务”的组合。 -2. **希望将 PaddleOCR-VL 用于生产环境**: - - 快速体验虽然可以让您感受到 PaddleOCR-VL 的效果,但在推理速度、显存占用等方面不是最佳状态。如果您希望将 PaddleOCR-VL 应用于生产环境,并且对推理性能有更高的要求,请阅读 [3. 使用推理加速框架提升 VLM 推理性能](#3-vlm) 或其他硬件文档中的对应章节。 + 建议先按上一条完成本地直接推理的基本跑通,再继续阅读 [3. 使用 VLM 推理服务提升推理性能](#3-vlm) 或其他硬件文档中的对应章节。 -3. **希望将 PaddleOCR-VL 部署为 API 服务**: + 需要特别注意的是,**第 3 节启动的是 VLM 推理服务,而不是 PaddleOCR-VL 的完整 API 服务**;版面检测等其他环节仍在客户端执行。 - 您可以将 PaddleOCR-VL 部署为一个网络服务(API),这样客户端应用程序无需配置环境,仅通过一个特定的网址就可以调用 PaddleOCR-VL 的能力。在对并发处理能力没有特别要求的情况下,可以选择以下两种方案之一: +3. **部署完整 API 服务**: - - 使用 Docker Compose 部署(一键启动,推荐使用):请阅读 [4.1 方法一:使用 Docker Compose 部署](#41-docker-compose) 和 [4.3 客户端调用方式](#43),或其他硬件文档中的对应章节。 - - 手动部署:请阅读 [1. 环境准备](#1)、 [4.2 方法二:手动部署](#42) 和 [4.3 客户端调用方式](#43),或其他硬件文档中的对应章节。 + 适用于将 PaddleOCR-VL 的完整能力封装为网络服务,客户端仅通过 HTTP 接口即可完成调用。与上一条不同,这里部署的是可直接对外提供完整 PaddleOCR-VL 能力的 API 服务,而不是仅负责 VLM 推理的后端服务。在对并发处理能力没有特别要求的情况下,可以选择以下两种方案之一: + + - 使用 Docker Compose 部署(一键启动,推荐使用):采用“PaddlePaddle + VLM 推理服务”的推理方式,底层 VLM 服务使用推理加速框架。请阅读 [4.1 方法一:使用 Docker Compose 部署](#41-docker-compose) 和 [4.3 客户端调用方式](#43),或其他硬件文档中的对应章节。 + - 手动部署:默认采用 PaddlePaddle 推理方式,也可切换到 Transformers,或者通过配置 VLM 推理服务实现“版面检测推理方式 + VLM 推理服务”组合。请阅读 [1. 
环境准备](#1)、[4.2 方法二:手动部署](#42) 和 [4.3 客户端调用方式](#43),或其他硬件文档中的对应章节。 如需支持并发请求处理,请参考[高性能服务化部署方案](https://github.com/PaddlePaddle/PaddleOCR/blob/main/deploy/paddleocr_vl_docker/hps/README.md)。 -4. **希望对 PaddleOCR-VL 进行微调以适配特定业务**: +4. **模型微调**: 如果您发现 PaddleOCR-VL 在特定业务场景中的精度表现未达预期,请阅读 [5. 模型微调](#5) 或其他硬件文档中的对应章节。 + +各硬件对应的使用教程: + +| 硬件类型 | 使用教程 | +|-----------------|--------------------------------------------------| +| x64 CPU | 本教程(当前仅支持手动安装依赖) | +| 英伟达 GPU | - NVIDIA Blackwell 架构 GPU(如RTX 50 系)参考 [PaddleOCR-VL NVIDIA Blackwell 架构 GPU 使用教程](./PaddleOCR-VL-NVIDIA-Blackwell.md)
        - 其他英伟达 GPU 参考本教程 | +| 昆仑芯 XPU | [PaddleOCR-VL 昆仑芯 XPU 使用教程](./PaddleOCR-VL-Kunlunxin-XPU.md) | +| 海光 DCU | [PaddleOCR-VL 海光 DCU 使用教程](./PaddleOCR-VL-Hygon-DCU.md) | +| 沐曦 GPU | [PaddleOCR-VL 沐曦 GPU 使用教程](./PaddleOCR-VL-MetaX-GPU.md) | +| 天数 GPU | [PaddleOCR-VL 天数 GPU 使用教程](./PaddleOCR-VL-Iluvatar-GPU.md) | +| 华为昇腾 NPU | [PaddleOCR-VL 华为昇腾 NPU 使用教程](./PaddleOCR-VL-Huawei-Ascend-NPU.md) | +| Apple Silicon | [PaddleOCR-VL Apple Silicon 使用教程](./PaddleOCR-VL-Apple-Silicon.md) | +| AMD GPU | [PaddleOCR-VL AMD GPU 使用教程](./PaddleOCR-VL-AMD-GPU.md) | +| Intel Arc GPU | [PaddleOCR-VL Intel Arc GPU 使用教程](./PaddleOCR-VL-Intel-Arc-GPU.md) | + ## PaddleOCR-VL 对推理设备的支持情况 -目前 PaddleOCR-VL 有六种推理方式,支持的推理设备不完全相同,请确认您的推理设备是否满足下表要求再进行 PaddleOCR-VL 的推理部署: +目前 PaddleOCR-VL 提供多种推理方式,支持的推理设备不完全相同,请确认您的推理设备是否满足下表要求再进行 PaddleOCR-VL 的推理部署: @@ -74,6 +106,19 @@ PaddleOCR-VL 是一款先进、高效的文档解析模型,专为文档中的 + + + + + + + + + + + + + @@ -139,48 +184,99 @@ PaddleOCR-VL 是一款先进、高效的文档解析模型,专为文档中的 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        Transformers🚧🚧🚧🚧🚧🚧🚧🚧
        PaddlePaddle + vLLM 🚧 🚧
        Transformers + vLLM🚧🚧🚧🚧🚧--🚧🚧
        Transformers + SGLang🚧🚧🚧🚧🚧--🚧🚧
        Transformers + FastDeploy🚧🚧🚧🚧🚧--🚧🚧
        Transformers + MLX-VLM---------
        Transformers + llama.cpp🚧🚧🚧🚧🚧🚧🚧🚧
        推理方式说明 -“PaddlePaddle” 表示版面检测模型与 VLM 均使用飞桨框架推理,PaddleOCR CLI 与 Python API 默认使用这种推理方式;其余推理方式遵循 “版面检测模型推理方式 + VLM 推理方式” 的格式,如“PaddlePaddle + vLLM”是指版面检测模型使用 PaddlePaddle 推理,VLM 使用 vLLM 推理。 +“PaddlePaddle” 表示版面检测模型与 VLM 均使用飞桨框架推理,PaddleOCR CLI 与 Python API 默认使用这种推理方式;“Transformers” 表示版面检测模型与 VLM 均通过 Transformers 引擎推理;其余推理方式遵循 “版面检测模型推理方式 + VLM 推理方式” 的格式,如“PaddlePaddle + vLLM”是指版面检测模型使用 PaddlePaddle 推理,VLM 使用 vLLM 推理。
        > TIP: > - 使用英伟达 GPU 推理时需要注意 Compute Capability(简称 CC) 和 CUDA 版本(简称 CUDA)是否满足要求: -> > - PaddlePaddle: CC ≥ 7.0, CUDA ≥ 11.8 -> > - vLLM: CC ≥ 8.0, CUDA ≥ 12.6 -> > - SGLang: 8.0 ≤ CC < 12.0, CUDA ≥ 12.6 -> > - FastDeploy: 8.0 ≤ CC < 12.0, CUDA ≥ 12.6 +> > - PaddlePaddle:CC ≥ 7.0, CUDA ≥ 11.8 +> > - Transformers:CC ≥ 7.0, CUDA ≥ 11.8 +> > - vLLM:CC ≥ 8.0, CUDA ≥ 12.6 +> > - SGLang:8.0 ≤ CC < 12.0, CUDA ≥ 12.6 +> > - FastDeploy:8.0 ≤ CC < 12.0, CUDA ≥ 12.6 > > - CC ≥ 8 的常见显卡包括 RTX 30/40/50 系列及 A10/A100 等,更多型号可查看 [CUDA GPU 计算能力](https://developer.nvidia.cn/cuda-gpus) > - 虽然 vLLM 可在 T4/V100 等 CC 7.x 的 NVIDIA GPU 上启动,但容易出现超时或 OOM,不推荐使用。 > - vLLM、SGLang 和 FastDeploy 无法在 Windows 上原生运行,请使用我们提供的 Docker 镜像。 - -由于不同硬件所需的依赖各不相同,如果您的硬件满足上述表格的要求,请参考下表查看对应的使用教程: - -| 硬件类型 | 使用教程 | -|-----------------|--------------------------------------------------| -| x64 CPU | 本教程(当前仅支持手动安装依赖) | -| 英伟达 GPU | - NVIDIA Blackwell 架构 GPU(如RTX 50 系)参考 [PaddleOCR-VL NVIDIA Blackwell 架构 GPU 使用教程](./PaddleOCR-VL-NVIDIA-Blackwell.md)
        - 其他 NVIDIA GPU 参考本教程 | -| 昆仑芯 XPU | [PaddleOCR-VL 昆仑芯 XPU 使用教程](./PaddleOCR-VL-Kunlunxin-XPU.md) | -| 海光 DCU | [PaddleOCR-VL 海光 DCU 使用教程](./PaddleOCR-VL-Hygon-DCU.md) | -| 沐曦 GPU | [PaddleOCR-VL 沐曦 GPU 使用教程](./PaddleOCR-VL-MetaX-GPU.md) | -| 天数 GPU | [PaddleOCR-VL 天数 GPU 使用教程](./PaddleOCR-VL-Iluvatar-GPU.md) | -| 华为昇腾 NPU | [PaddleOCR-VL 华为昇腾 NPU 使用教程](./PaddleOCR-VL-Huawei-Ascend-NPU.md) | -| Apple Silicon | [PaddleOCR-VL Apple Silicon 使用教程](./PaddleOCR-VL-Apple-Silicon.md) | -| AMD GPU | [PaddleOCR-VL AMD GPU 使用教程](./PaddleOCR-VL-AMD-GPU.md) | -| Intel Arc GPU | [PaddleOCR-VL Intel Arc GPU 使用教程](./PaddleOCR-VL-Intel-Arc-GPU.md) | - -> TIP: -> 例如您使用的是 RTX 50 系 GPU,满足 PaddlePaddle 和 vLLM 推理方式的设备要求,请参考 [PaddleOCR-VL NVIDIA Blackwell 架构 GPU 使用教程](./PaddleOCR-VL-NVIDIA-Blackwell.md) 了解相关配置和用法。 +> - 由于不同库之间存在依赖冲突,使用 Transformers + vLLM 等混合推理方式时需将版面检测模型和 VLM 服务部署在不同环境中。 ## 1. 环境准备 -此步骤主要介绍如何搭建 PaddleOCR-VL 的运行环境,有以下两种方式,任选一种即可: +此步骤主要介绍如何搭建 PaddleOCR-VL 的运行环境。本教程主要适用于 **x64 CPU** 和 **除 Blackwell 之外的 NVIDIA GPU** 用户,其他硬件请优先参考上文列出的专用教程。 + +在本教程中,有以下两种环境准备方式: -- 方法一:使用官方 Docker 镜像。 +- 方法一:使用官方 Docker 镜像(仅适用于 NVIDIA GPU)。 -- 方法二:手动安装 PaddlePaddle 和 PaddleOCR。 +- 方法二:手动安装推理引擎和 PaddleOCR(x64 CPU 和 NVIDIA GPU 均可使用)。 **我们强烈推荐采用 Docker 镜像的方式,以最大程度减少可能出现的环境问题。** @@ -214,12 +310,14 @@ docker load -i paddleocr-vl-latest-nvidia-gpu-offline.tar # 之后可以在离线机器上使用 `docker run` 启动容器 ``` +镜像中仅预装飞桨框架,未安装其他推理引擎(如 Transformers)。如果希望使用其他推理引擎,建议采用方法二手动安装(不建议在预装飞桨框架的环境中安装)。 + > TIP: -> 标签后缀为 `latest-xxx` 的镜像对应 PaddleOCR 的最新版本。如果希望使用特定版本的 PaddleOCR 镜像,可以将标签中的 `latest` 替换为对应版本号:`paddleocr.`。 +> 标签后缀为 `latest-xxx` 的镜像对应最新版本。如果希望使用特定版本的镜像,可以将标签中的 `latest` 替换为 PaddleOCR 版本号:`paddleocr.`。 > 例如: > `ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleocr-vl:paddleocr3.3-nvidia-gpu-offline` -### 1.2 方法二:手动安装 PaddlePaddle 和 PaddleOCR +### 1.2 方法二:手动安装推理引擎和 PaddleOCR 如果您无法使用 Docker,也可以手动安装 PaddlePaddle 和 PaddleOCR。要求 Python 版本为 3.8–3.13。 @@ -232,57 +330,84 @@ python -m venv .venv_paddleocr 
source .venv_paddleocr/bin/activate ``` -执行如下命令完成安装: +请先根据所选推理引擎安装对应依赖: + +- 使用 PaddlePaddle 推理时:请安装 3.2.1 及以上版本的 PaddlePaddle。常见安装方式如下(**注意不允许同时安装 CPU 和 GPU 版本的 PaddlePaddle**): ```shell -# 以下命令安装 CUDA 12.6 版本的 PaddlePaddle,对于其他 CUDA 版本以及 CPU 版本,请参考 https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html +# 英伟达 GPU(以 CUDA 12.6 为例) python -m pip install paddlepaddle-gpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ -python -m pip install -U "paddleocr[doc-parser]" + +# x64 CPU +python -m pip install paddlepaddle==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ ``` -> IMPORTANT: -> **请注意安装 3.2.1 及以上版本的飞桨框架。** + 对于其他 CUDA 版本,请参考飞桨安装文档:[https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html) + +- 使用 `transformers` 推理时:请参考 [Transformers 官方文档](https://huggingface.co/docs/transformers/installation) 安装 `transformers` 及其依赖的底层推理框架。 + +完成推理引擎安装后,再执行如下命令安装 PaddleOCR-VL 所需的基础包: + +```shell +python -m pip install -U "paddleocr[doc-parser]" +``` ## 2. 
快速开始 此步骤主要介绍如何使用 PaddleOCR-VL,包括如何通过 CLI 命令行方式和 Python API 方式进行使用。 -PaddleOCR-VL 支持 CLI 命令行方式和 Python API 两种使用方式,其中 CLI 命令行方式更简单,适合快速验证功能,而 Python API 方式更灵活,适合集成到现有项目中。 +PaddleOCR-VL 支持 CLI 命令行方式和 Python API 两种使用方式,其中 CLI 命令行方式更简单,适合快速验证功能,而 Python API 方式更灵活,适合集成到现有项目中。下文示例默认使用飞桨框架推理;如需切换到 `transformers` 引擎,可在 CLI 中追加 `--engine transformers`,或在 Python API 初始化时传入 `engine="transformers"`。 > IMPORTANT: -> 本节所介绍的方法主要用于快速验证,其推理速度、显存占用及稳定性表现未必能满足生产环境的要求。**若需部署至生产环境,我们强烈建议使用专门的推理加速框架** ,具体方法请参考下一节。 +> 本节所介绍的方法主要用于快速验证,其推理速度、显存占用及稳定性表现未必能满足生产环境的要求。**若需部署至生产环境,我们强烈建议使用专门的 VLM 推理服务**,具体方法请参考下一节。 ### 2.1 命令行方式体验 -一行命令即可快速体验 PaddleOCR-VL 效果: +首次运行时,PaddleOCR-VL 会自动下载官方模型,请确保当前环境可以联网,并预留一定的下载和初始化时间。 + +如果您想直接使用本文中的本地图像示例,可先下载测试图片: + +```shell +curl -L -o paddleocr_vl_demo.png https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png +``` + +下面给出一组可直接复制的示例命令。建议首次体验时附加 `--save_path ./output`,便于在当前目录下查看保存结果: ```shell # 英伟达 GPU -paddleocr doc_parser -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png +paddleocr doc_parser -i ./paddleocr_vl_demo.png --save_path ./output # 昆仑芯 XPU -paddleocr doc_parser -i ./paddleocr_vl_demo.png --device xpu +paddleocr doc_parser -i ./paddleocr_vl_demo.png --device xpu --save_path ./output # 海光 DCU -paddleocr doc_parser -i ./paddleocr_vl_demo.png --device dcu +paddleocr doc_parser -i ./paddleocr_vl_demo.png --device dcu --save_path ./output # 沐曦 GPU -paddleocr doc_parser -i ./paddleocr_vl_demo.png --device metax_gpu +paddleocr doc_parser -i ./paddleocr_vl_demo.png --device metax_gpu --save_path ./output # Apple Silicon -paddleocr doc_parser -i ./paddleocr_vl_demo.png --device cpu +paddleocr doc_parser -i ./paddleocr_vl_demo.png --device cpu --save_path ./output # 华为昇腾 NPU # 华为昇腾 NPU 请参考第 3 章节使用 PaddlePaddle + vLLM 的方式进行推理 # 通过 --use_doc_orientation_classify 指定是否使用文档方向分类模型 -paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_doc_orientation_classify True +paddleocr doc_parser -i 
./paddleocr_vl_demo.png --use_doc_orientation_classify True --save_path ./output # 通过 --use_doc_unwarping 指定是否使用文本图像矫正模块 -paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_doc_unwarping True +paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_doc_unwarping True --save_path ./output # 通过 --use_layout_detection 指定是否使用版面区域检测排序模块 -paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_layout_detection False +paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_layout_detection False --save_path ./output +``` + +执行成功后,终端会打印结构化结果;如果设置了 `--save_path ./output`,结果文件也会保存到当前目录下的 `output` 中,便于继续查看和调试。 + +若需切换到 `transformers` 引擎,可参考以下示例: + +```bash +paddleocr doc_parser -i ./paddleocr_vl_demo.png --engine transformers --save_path ./output ```
        命令行支持更多参数设置,点击展开以查看命令行参数的详细说明 @@ -367,7 +492,8 @@ paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_layout_detection False
      • large,设置为large时,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留外部最大的框,删除重叠的内部框;
      • small,设置为small,表示在模型输出的检测框中,对于互相重叠包含的检测框,只保留内部被包含的小框,删除重叠的外部框;
      • union,不进行框的过滤处理,内外框都保留;
      • -
      如果不设置,将使用初始化的参数值。 +
    +如果不设置,将使用初始化的参数值。 str @@ -580,12 +706,19 @@ paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_layout_detection False
  • DCU:如 dcu:0 表示使用第 1 块 DCU 进行推理;
  • 沐曦 GPU:如 metax_gpu:0 表示使用第 1 块沐曦 GPU 进行推理;
  • 天数 GPU:如 iluvatar_gpu:0 表示使用第 1 块天数 GPU 进行推理;
  • -如果不设置,将使用初始化的默认值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。 + +如果不设置,将使用初始化的默认值,初始化时,会优先使用本地的 GPU 0号设备,如果没有,则使用 CPU 设备。 str +engine +含义:推理引擎。
    说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic 和 transformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见
    推理引擎与配置说明。 +str|None +None + + enable_hpi 含义:是否启用高性能推理。 bool @@ -599,13 +732,13 @@ paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_layout_detection False 对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
    bool - +False precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 fp32、fp16。 str - +fp32 enable_mkldnn @@ -614,7 +747,7 @@ paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_layout_detection False 如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool - +True mkldnn_cache_capacity @@ -622,13 +755,13 @@ paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_layout_detection False 含义:MKL-DNN 缓存容量。 int - +10 cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int - +10 paddlex_config @@ -636,6 +769,7 @@ paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_layout_detection False str + 
    @@ -648,17 +782,22 @@ paddleocr doc_parser -i ./paddleocr_vl_demo.png --use_layout_detection False {'res': {'input_path': 'paddleocr_vl_demo.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_layout_detection': True, 'use_chart_recognition': False, 'format_block_content': False}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 6, 'label': 'doc_title', 'score': 0.9636914134025574, 'coordinate': [np.float32(131.31366), np.float32(36.450516), np.float32(1384.522), np.float32(127.984665)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9281806349754333, 'coordinate': [np.float32(585.39465), np.float32(158.438), np.float32(930.2184), np.float32(182.57469)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9840355515480042, 'coordinate': [np.float32(9.023666), np.float32(200.86115), np.float32(361.41583), np.float32(343.8828)]}, {'cls_id': 14, 'label': 'image', 'score': 0.9871416091918945, 'coordinate': [np.float32(775.50574), np.float32(200.66502), np.float32(1503.3807), np.float32(684.9304)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9801855087280273, 'coordinate': [np.float32(9.532196), np.float32(344.90594), np.float32(361.4413), np.float32(440.8244)]}, {'cls_id': 17, 'label': 'paragraph_title', 'score': 0.9708921313285828, 'coordinate': [np.float32(28.040405), np.float32(455.87976), np.float32(341.7215), np.float32(520.7117)]}, {'cls_id': 24, 'label': 'vision_footnote', 'score': 0.9002962708473206, 'coordinate': [np.float32(809.0692), np.float32(703.70044), np.float32(1488.3016), np.float32(750.5238)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9825374484062195, 'coordinate': [np.float32(8.896561), np.float32(536.54895), np.float32(361.05237), np.float32(655.8058)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9822263717651367, 'coordinate': [np.float32(8.971573), np.float32(657.4949), np.float32(362.01715), np.float32(774.625)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9767460823059082, 'coordinate': 
[np.float32(9.407074), np.float32(776.5216), np.float32(361.31067), np.float32(846.82874)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9868153929710388, 'coordinate': [np.float32(8.669495), np.float32(848.2543), np.float32(361.64703), np.float32(1062.8568)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9826608300209045, 'coordinate': [np.float32(8.8025055), np.float32(1063.8615), np.float32(361.46588), np.float32(1182.8524)]}, {'cls_id': 22, 'label': 'text', 'score': 0.982555627822876, 'coordinate': [np.float32(8.820602), np.float32(1184.4663), np.float32(361.66394), np.float32(1302.4507)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9584776759147644, 'coordinate': [np.float32(9.170288), np.float32(1304.2161), np.float32(361.48898), np.float32(1351.7483)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9782056212425232, 'coordinate': [np.float32(389.1618), np.float32(200.38202), np.float32(742.7591), np.float32(295.65146)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9844875931739807, 'coordinate': [np.float32(388.73303), np.float32(297.18463), np.float32(744.00024), np.float32(441.3034)]}, {'cls_id': 17, 'label': 'paragraph_title', 'score': 0.9680547714233398, 'coordinate': [np.float32(409.39468), np.float32(455.89386), np.float32(721.7174), np.float32(520.9387)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9741666913032532, 'coordinate': [np.float32(389.71606), np.float32(536.8138), np.float32(742.7112), np.float32(608.00165)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9840384721755981, 'coordinate': [np.float32(389.30988), np.float32(609.39636), np.float32(743.09247), np.float32(750.3231)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9845995306968689, 'coordinate': [np.float32(389.13272), np.float32(751.7772), np.float32(743.058), np.float32(894.8815)]}, {'cls_id': 22, 'label': 'text', 'score': 0.984852135181427, 'coordinate': [np.float32(388.83267), np.float32(896.0371), np.float32(743.58215), np.float32(1038.7345)]}, {'cls_id': 22, 'label': 'text', 'score': 
0.9804865717887878, 'coordinate': [np.float32(389.08478), np.float32(1039.9119), np.float32(742.7585), np.float32(1134.4897)]}, {'cls_id': 22, 'label': 'text', 'score': 0.986461341381073, 'coordinate': [np.float32(388.52643), np.float32(1135.8137), np.float32(743.451), np.float32(1352.0085)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9869391918182373, 'coordinate': [np.float32(769.8341), np.float32(775.66235), np.float32(1124.9813), np.float32(1063.207)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9822869896888733, 'coordinate': [np.float32(770.30383), np.float32(1063.938), np.float32(1124.8295), np.float32(1184.2192)]}, {'cls_id': 17, 'label': 'paragraph_title', 'score': 0.9689218997955322, 'coordinate': [np.float32(791.3042), np.float32(1199.3169), np.float32(1104.4521), np.float32(1264.6985)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9713128209114075, 'coordinate': [np.float32(770.4253), np.float32(1279.6072), np.float32(1124.6917), np.float32(1351.8672)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9236552119255066, 'coordinate': [np.float32(1153.9058), np.float32(775.5814), np.float32(1334.0654), np.float32(798.1581)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9857938885688782, 'coordinate': [np.float32(1151.5197), np.float32(799.28015), np.float32(1506.3619), np.float32(991.1156)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9820687174797058, 'coordinate': [np.float32(1151.5686), np.float32(991.91095), np.float32(1506.6023), np.float32(1110.8875)]}, {'cls_id': 22, 'label': 'text', 'score': 0.9866049885749817, 'coordinate': [np.float32(1151.6919), np.float32(1112.1301), np.float32(1507.1611), np.float32(1351.9504)]}]}}} -运行结果参数说明可以参考[2.2 Python脚本方式集成](#22-python)中的结果解释。 +运行结果及保存接口的详细说明可参考 [2.2 Python脚本方式集成](#22-python) 中的结果解释。 -注:由于 PaddleOCR-VL 的默认模型较大,推理速度可能较慢,建议实际推理使用 [3. 使用推理加速框架提升 VLM 推理性能](#3-vlm) 方式进行快速推理。 +注:由于 PaddleOCR-VL 的默认模型较大,推理速度可能较慢,建议实际推理使用 [3. 
使用 VLM 推理服务提升推理性能](#3-vlm) 方式进行快速推理。 ### 2.2 Python脚本方式集成 -命令行方式是为了快速体验查看效果,一般来说,在项目中,往往需要通过代码集成,您可以通过几行代码即可完成 PaddleOCR-VL 的快速推理,推理代码如下: +命令行方式是为了快速体验查看效果,一般来说,在项目中,往往需要通过代码集成。您可以通过几行代码即可完成 PaddleOCR-VL 的快速推理: ```python +from pathlib import Path + from paddleocr import PaddleOCRVL +output_dir = Path("./output") +output_dir.mkdir(parents=True, exist_ok=True) + # 英伟达 GPU pipeline = PaddleOCRVL() # 昆仑芯 XPU @@ -679,18 +818,40 @@ pipeline = PaddleOCRVL() output = pipeline.predict("./paddleocr_vl_demo.png") for res in output: res.print() ## 打印预测的结构化输出 - res.save_to_json(save_path="output") ## 保存当前图像的结构化json结果 - res.save_to_markdown(save_path="output") ## 保存当前图像的markdown格式的结果 - res.save_to_word(save_path="output") ## 保存当前图像的Word格式的结果 + res.save_to_json(save_path=output_dir) ## 保存当前图像的结构化json结果 + res.save_to_markdown(save_path=output_dir) ## 保存当前图像的markdown格式的结果 + res.save_to_word(save_path=output_dir) ## 保存当前图像的Word格式的结果 +``` + +若需切换到 `transformers` 引擎,可参考以下示例: + +```python +from pathlib import Path + +from paddleocr import PaddleOCRVL + +output_dir = Path("./output") +output_dir.mkdir(parents=True, exist_ok=True) + +pipeline = PaddleOCRVL(engine="transformers") +output = pipeline.predict("./paddleocr_vl_demo.png") +for res in output: + res.print() ## 打印预测的结构化输出 + res.save_to_json(save_path=output_dir) ## 保存当前图像的结构化json结果 + res.save_to_markdown(save_path=output_dir) ## 保存当前图像的markdown格式的结果 + res.save_to_word(save_path=output_dir) ## 保存当前图像的Word格式的结果 ``` 如果是 PDF 文件,会将 PDF 的每一页单独处理,每一页的 Markdown 文件也会对应单独的结果。如果您希望对多页的推理结果进行跨页表格合并、重建多级标题和合并多页结果等需求,可以通过如下方式实现: ```python +from pathlib import Path + from paddleocr import PaddleOCRVL input_file = "./your_pdf_file.pdf" -output_path = Path("./output") +output_dir = Path("./output") +output_dir.mkdir(parents=True, exist_ok=True) pipeline = PaddleOCRVL() @@ -706,8 +867,8 @@ output = pipeline.restructure_pages(pages_res) for res in output: res.print() ## 打印预测的结构化输出 - res.save_to_json(save_path="output") ## 保存当前图像的结构化json结果 - 
res.save_to_markdown(save_path="output") ## 保存当前图像的markdown格式的结果 + res.save_to_json(save_path=output_dir) ## 保存当前图像的结构化json结果 + res.save_to_markdown(save_path=output_dir) ## 保存当前图像的markdown格式的结果 ``` 如果您需要处理多个文件,**建议将包含文件的目录路径,或者文件路径列表传入 `predict` 方法**,以最大化处理效率。例如: @@ -970,11 +1131,24 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"] str|None None + +engine +含义:推理引擎。
    说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic 和 transformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 +str|None +None + + +engine_config +含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。 +dict|None +None + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt @@ -988,7 +1162,7 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"] precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 "fp32""fp16"str "fp32" @@ -1013,7 +1187,7 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"] cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config @@ -1021,6 +1195,7 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"] str|None None + @@ -1149,7 +1324,7 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"] prompt_label -含义:VL模型的 prompt 类型设置。
    +含义:VL模型的 prompt 类型设置。
    说明: 当且仅当 use_layout_detection=False 时生效。可填写参数为 ocr、formula、table、seal、chart 和 spotting。 str|None @@ -1226,7 +1401,8 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"]
  • formula_max_pixels:公式最大分辨率
  • seal_min_pixels:印章最小分辨率
  • seal_max_pixels:印章最大分辨率
  • - + + dict|None None @@ -1352,7 +1528,6 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"] False - save_to_html() 将文件中的表格保存为html格式的文件 save_path @@ -1368,14 +1543,6 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"] 保存的文件路径,支持目录或文件路径。 无 - -save_to_word() -将版面解析结果保存为Word (.docx) 格式的文件 -save_path -str -保存的文件路径,支持目录或文件路径 -无 -
      @@ -1384,7 +1551,7 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"]
    • input_path: (str) 待预测图像的输入路径
    • page_index: (Union[int, None]) 如果输入是PDF文件,则表示当前是PDF的第几页,否则为 None
    • model_settings: (Dict[str, bool]) 配置产线所需的模型参数 -
        +
        1. use_doc_preprocessor: (bool) 控制是否启用文档预处理子产线
        2. use_layout_detection: (bool) 控制是否启用版面检测模块
        3. use_chart_recognition: (bool) 控制是否开启图表识别功能
        4. @@ -1393,8 +1560,8 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"]
        5. doc_preprocessor_res: (Dict[str, Union[str, Dict[str, bool], int]]) 文档预处理子产线的输出结果。仅当use_doc_preprocessor=True时存在
            -
          1. input_path: (str) 文档预处理子接受的图像路径,当输入为numpy.ndarray时,保存为None,此处为None
          2. -
          3. page_index: None 此处的输入为numpy.ndarray时,所以值为None
          4. +
          5. input_path: (str) 文档预处理子产线接受的图像路径,当输入为numpy.ndarray时,保存为None,此处为None
          6. +
          7. page_index: None,此处的输入为numpy.ndarray,所以值为None
          8. model_settings: (Dict[str, bool]) 文档预处理子的模型配置参数
            • use_doc_orientation_classify: (bool) 控制是否启用文档方向分类
            • @@ -1404,23 +1571,23 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"]
            • angle: (int) 文档图像方向分类子模块的预测结果,启用时返回实际角度值
        6. -
        7. parsing_res_list: (List[Dict]) 解析结果的列表,每个元素为一个字典,列表顺序为解析后的阅读顺序。
        8. +
        9. parsing_res_list: (List[Dict]) 解析结果的列表,每个元素为一个字典,列表顺序为解析后的阅读顺序。
          1. block_bbox: (np.ndarray) 版面区域的边界框。
          2. -
          3. block_label: (str) 版面区域的标签,例如text, table
          4. +
          5. block_label: (str) 版面区域的标签,例如 text、table 等
          6. block_content: (str) 内容为版面区域内的内容。
          7. block_id: (int) 版面区域的索引,用于显示版面排序结果。
          8. -
          9. block_order: (int) 版面区域的顺序,用于显示版面阅读顺序,对于非排序部分,默认值为 None
          10. +
          11. block_order: (int) 版面区域的顺序,用于显示版面阅读顺序,对于非排序部分,默认值为 None
        -
      1. 调用save_to_json() 方法会将上述内容保存到指定的save_path中,如果指定为目录,则保存的路径为save_path/{your_img_basename}_res.json,如果指定为文件,则直接保存到该文件中。由于json文件不支持保存numpy数组,因此会将其中的numpy.array类型转换为列表形式。json中的字段内容如下:
      2. +
      3. 调用save_to_json() 方法会将上述内容保存到指定的save_path中,如果指定为目录,则保存的路径为save_path/{your_img_basename}_res.json;如果指定为文件,则直接保存到该文件中。由于 JSON 文件不支持保存 numpy 数组,因此会将其中的numpy.array类型转换为列表形式。JSON 中的字段内容如下:
        1. input_path: (str) 待预测图像的输入路径
        2. page_index: (Union[int, None]) 如果输入是PDF文件,则表示当前是PDF的第几页,否则为 None
        3. model_settings: (Dict[str, bool]) 配置产线所需的模型参数 -
            +
            1. use_doc_preprocessor: (bool) 控制是否启用文档预处理子产线
            2. use_layout_detection: (bool) 控制是否启用版面检测模块
            3. use_chart_recognition: (bool) 控制是否开启图表识别功能
            4. @@ -1429,8 +1596,8 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"]
            5. doc_preprocessor_res: (Dict[str, Union[str, Dict[str, bool], int]]) 文档预处理子产线的输出结果。仅当use_doc_preprocessor=True时存在
                -
              1. input_path: (str) 文档预处理子接受的图像路径,当输入为numpy.ndarray时,保存为None,此处为None
              2. -
              3. page_index: None 此处的输入为numpy.ndarray时,所以值为None
              4. +
              5. input_path: (str) 文档预处理子产线接受的图像路径,当输入为numpy.ndarray时,保存为None,此处为None
              6. +
              7. page_index: None,此处的输入为numpy.ndarray,所以值为None
              8. model_settings: (Dict[str, bool]) 文档预处理子的模型配置参数
                • use_doc_orientation_classify: (bool) 控制是否启用文档方向分类
                • @@ -1440,63 +1607,62 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"]
                • angle: (int) 文档图像方向分类子模块的预测结果,启用时返回实际角度值
            6. -
            7. parsing_res_list: (List[Dict]) 解析结果的列表,每个元素为一个字典,列表顺序为解析后的阅读顺序。
            8. +
            9. parsing_res_list: (List[Dict]) 解析结果的列表,每个元素为一个字典,列表顺序为解析后的阅读顺序。
              1. block_bbox: (np.ndarray) 版面区域的边界框。
              2. -
              3. block_label: (str) 版面区域的标签,例如text, table
              4. +
              5. block_label: (str) 版面区域的标签,例如 text、table 等
              6. block_content: (str) 内容为版面区域内的内容。
              7. block_id: (int) 版面区域的索引,用于显示版面排序结果。
              8. -
              9. block_order: (int) 版面区域的顺序,用于显示版面阅读顺序,对于非排序部分,默认值为 None
              10. +
              11. block_order: (int) 版面区域的顺序,用于显示版面阅读顺序,对于非排序部分,默认值为 None
            -
          1. 调用save_to_img() 方法会将可视化结果保存到指定的save_path中,如果指定为目录,则会将版面区域检测可视化图像、全局OCR可视化图像、版面阅读顺序可视化图像等内容保存,如果指定为文件,则直接保存到该文件中。
          2. -
          3. 调用save_to_markdown() 方法会将转化后的 Markdown 文件保存到指定的save_path中,保存的文件路径为save_path/{your_img_basename}.md,如果输入是 PDF 文件,建议直接指定目录,否责多个 markdown 文件会被覆盖。
          4. -
          5. 此外,也支持通过属性获取带结果的可视化图像和预测结果,具体如下: - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            属性属性说明
            json获取预测的 json 格式的结果
            img获取格式为 dict 的可视化图像
            markdown获取格式为 dict 的 markdown 结果
            - -
              -
            • json 属性获取的预测结果为dict类型的数据,相关内容与调用 save_to_json() 方法保存的内容一致。
            • -
            • img 属性返回的预测结果是一个dict类型的数据。其中,键分别为 ocr_res_imgpreprocessed_img,对应的值是两个 Image.Image 对象:一个用于显示 OCR 结果的可视化图像,另一个用于展示图像预处理的可视化图像。如果没有使用图像预处理子模块,则dict中只包含 ocr_res_img
            • +
            • 调用save_to_img() 方法会将可视化结果保存到指定的save_path中,如果指定为目录,则会将版面区域检测可视化图像、全局OCR可视化图像、版面阅读顺序可视化图像等内容保存;如果指定为文件,则直接保存到该文件中。
            • +
            • 调用save_to_markdown() 方法会将转化后的 Markdown 文件保存到指定的save_path中,保存的文件路径为save_path/{your_img_basename}.md。如果输入是 PDF 文件,建议直接指定目录,否则多个 Markdown 文件会被覆盖。
            • +
            • 此外,也支持通过属性获取带结果的可视化图像和预测结果,具体如下: + + + + + + + + + + + + + + + + + + + + + +
              属性属性说明
              json获取预测的 json 格式的结果
              img获取格式为 dict 的可视化图像
              markdown获取格式为 dict 的 markdown 结果
              +
                +
              • json 属性获取的预测结果为 dict 类型的数据,相关内容与调用 save_to_json() 方法保存的内容一致。
              • +
              • img 属性返回的预测结果是一个 dict 类型的数据。其中,键分别为 ocr_res_imgpreprocessed_img,对应的值是两个 Image.Image 对象:一个用于显示 OCR 结果的可视化图像,另一个用于展示图像预处理的可视化图像。如果没有使用图像预处理子模块,则 dict 中只包含 ocr_res_img
              • +
              • markdown 属性获取的预测结果为 dict 类型的数据,相关内容与调用 save_to_markdown() 方法保存的内容一致。
              • +
              +
            -
          6. -## 3. 使用推理加速框架提升 VLM 推理性能 + +## 3. 使用 VLM 推理服务提升推理性能 -默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。此步骤主要介绍如何使用 vLLM、SGLang 和 FastDeploy 推理加速框架来提升 PaddleOCR-VL 的推理性能。 +只使用 PaddlePaddle 或 Transformers 通常无法得到最优的推理性能。此步骤主要介绍如何通过 VLM 推理服务提升 PaddleOCR-VL 的推理性能。您既可以自行部署基于 vLLM、SGLang、FastDeploy、MLX-VLM、llama.cpp 等后端的 VLM 推理服务,也可以直接使用兼容的托管服务。这一节对应“版面检测推理方式 + VLM 推理服务”类组合,其核心思路是:**客户端继续负责版面检测等完整流程中的其他环节,仅将 VLM 推理交给专用服务处理。** ### 3.1 启动 VLM 推理服务 +> IMPORTANT: +> 按照本节说明启动的服务仅负责 PaddleOCR-VL 流程中的 VLM 推理环节,不提供完整的端到端文档解析 API。强烈不建议直接通过 HTTP 请求或使用 OpenAI 客户端调用该服务处理文档图像。若您需要部署具备 PaddleOCR-VL 完整能力的服务,请参考后文的服务化部署部分。 + 启动 VLM 推理服务有以下三种方式,任选一种即可: - 方法一:使用官方 Docker 镜像启动服务,目前支持: @@ -1520,7 +1686,7 @@ output = pipeline.predict(["imgs/file1.png", "imgs/file2.png", "imgs/file3.png"] #### 3.1.1 方法一:使用 Docker 镜像 -PaddleOCR 提供了 Docker 镜像,用于快速启动 vLLM 或 FastDeploy 推理服务。可使用以下命令启动服务(要求 Docker 版本 >= 19.03,机器装配有 GPU 且 NVIDIA 驱动支持 CUDA 12.6 或以上版本): +PaddleOCR 提供了 Docker 镜像,用于快速启动 vLLM 或 FastDeploy 推理服务。可使用以下命令启动服务(要求 Docker 版本 >= 19.03,机器装配有 GPU 且英伟达驱动支持 CUDA 12.6 或以上版本): === "启动 vLLM 服务" @@ -1558,11 +1724,13 @@ docker run \ --rm \ --gpus all \ --network host \ - -v vllm_config.yml:/tmp/vllm_config.yml \ + -v vllm_config.yml:/tmp/vllm_config.yml \ ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleocr-genai-vllm-server:latest-nvidia-gpu \ paddleocr genai_server --model_name PaddleOCR-VL-1.5-0.9B --host 0.0.0.0 --port 8118 --backend vllm --backend_config /tmp/vllm_config.yml ``` +其中,`vllm_config.yml` 表示宿主机上的本地配置文件路径。示例中假设您在当前目录下创建了该文件;如果文件位于其他位置,请替换为实际绝对路径或相对路径。 + > TIP: > 标签后缀为 `latest-xxx` 的镜像对应 PaddleOCR 的最新版本。如果希望使用特定版本的 PaddleOCR 镜像,可以将标签中的 `latest` 替换为对应版本号:`paddleocr.`。 > 例如: @@ -1572,7 +1740,7 @@ docker run \ **PaddleOCR CLI 已经为您解决了复杂的版本兼容性问题。您无需花费时间研究推理框架的文档,只需一条简单的命令即可安装推理框架所需的依赖环境。** -由于推理加速框架可能与飞桨框架存在依赖冲突,建议在虚拟环境中安装: +由于推理加速框架可能与当前环境中的包存在依赖冲突,建议在虚拟环境中安装: ```shell # 如果当前存在已激活的虚拟环境,先通过 `deactivate` 取消激活 @@ -1601,7 +1769,10 @@ paddleocr install_genai_server_deps <推理加速框架名称> 当前支持的框架名称为 
`vllm`、`sglang` 和 `fastdeploy`,分别对应 vLLM、SGLang 和 FastDeploy。 -通过 `paddleocr install_genai_server_deps` 安装的 vLLM 与 SGLang 均为 **CUDA 12.6** 版本,请确保本地 NVIDIA 驱动与此版本一致或更高。 +通过 `paddleocr install_genai_server_deps` 安装的 vLLM 与 SGLang 均为 **CUDA 12.6** 版本,请确保本地英伟达驱动与此版本一致或更高。 + +> WARNING: +> 目前 vLLM 和 SGLang 与 Transformers 引擎所需的 transformers 库版本存在冲突,因此同一环境中无法同时安装 Transformers 引擎与 vLLM 或 SGLang。如果使用 Transformers + vLLM 或 Transformers + SGLang 的推理方式,请将版面检测模型和 VLM 服务部署在不同环境中。 安装完成后,可通过 `paddleocr genai_server` 命令启动服务: @@ -1617,7 +1788,7 @@ paddleocr genai_server --model_name PaddleOCR-VL-1.5-0.9B --backend vllm --port | `--model_dir` | 模型目录 | | `--host` | 服务器主机名 | | `--port` | 服务器端口号 | -| `--backend` | 后端名称,即使用的推理加速框架名称,可选 `vllm` 或 `sglang` | +| `--backend` | 后端名称,即使用的推理加速框架名称,可选 `vllm`、`sglang` 或 `fastdeploy` | | `--backend_config` | 可指定 YAML 文件,包含后端配置 | #### 3.1.3 方法三:直接使用推理加速框架启动服务 @@ -1641,11 +1812,9 @@ paddleocr genai_server --model_name PaddleOCR-VL-1.5-0.9B --backend vllm --port --temp 0 ``` - - ### 3.2 客户端使用方法 -启动 VLM 推理服务后,客户端即可通过 PaddleOCR 调用该服务。**请注意,由于客户端需要调用版面检测模型,仍建议在 GPU 等加速设备上运行客户端,以获得更稳定和高效的性能。客户端环境配置请参考第 1 节,3.1 节介绍的环境配置仅适用于启动服务,不适用于客户端。** +启动 VLM 推理服务后,客户端即可通过 PaddleOCR 调用该服务。本节既适用于调用 3.1 中自建的 VLM 推理服务,也适用于调用第三方提供的兼容托管服务。**请注意,由于客户端仍需要调用版面检测模型并完成其他流程环节,仍建议在 GPU 等加速设备上运行客户端,以获得更稳定和高效的性能。客户端环境配置请参考第 1 节,3.1 节介绍的环境配置仅适用于启动服务,不适用于客户端。若您希望客户端只通过 HTTP 接口调用 PaddleOCR-VL 的完整能力,请直接参考第 4 节“服务化部署”。** #### 3.2.1 CLI 调用 @@ -1735,7 +1904,7 @@ pipeline = PaddleOCRVL( ### 3.3 性能调优 -默认配置无法保证在所有环境中取得最优性能。如果用户在实际使用中遇到性能问题,可以尝试以下优化方法。 +默认配置无法保证在所有环境取得最优性能。如果您在实际使用中遇到性能问题,可以尝试以下优化方法。 #### 3.3.1 服务端参数调整 @@ -1817,7 +1986,7 @@ paddleocr-vl-api | INFO: Application startup complete. 
paddleocr-vl-api | INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) ``` -此方式基于 vLLM 等框架对 VLM 推理进行加速,更适合生产环境部署,但要求机器配备 GPU,并且 NVIDIA 驱动程序支持 CUDA 12.6 或以上版本。 +此方式基于 vLLM 等框架对 VLM 推理进行加速,更适合生产环境部署,但要求机器配备 GPU,并且英伟达驱动程序支持 CUDA 12.6 或以上版本。 此外,使用此方式启动服务器后,除拉取镜像外,无需连接互联网。如需在离线环境中部署,可先在联网机器上拉取 Compose 文件中涉及的镜像,导出并传输至离线机器中导入,即可在离线环境下启动服务。 @@ -1889,7 +2058,8 @@ Docker Compose 通过读取 `.env` 和 `compose.yaml` 文件中配置,先后 ```yaml paddleocr-vlm-server: ... - volumes: /path/to/your_config.yaml:/home/paddleocr/vlm_server_config.yaml + volumes: + - /path/to/your_config.yaml:/home/paddleocr/vlm_server_config.yaml command: paddleocr genai_server --model_name PaddleOCR-VL-1.5-0.9B --host 0.0.0.0 --port 8118 --backend vllm --backend_config /home/paddleocr/vlm_server_config.yaml ... ``` @@ -1921,6 +2091,8 @@ Docker Compose 通过读取 `.env` 和 `compose.yaml` 文件中配置,先后 执行以下命令,通过 PaddleX CLI 安装服务化部署插件: +> `paddlex` 命令会在安装 `paddleocr` 时一并安装,因此如果您已按前文完成 PaddleOCR 安装,通常无需额外安装 PaddleX。 + ```shell paddlex --install serving ``` @@ -1931,6 +2103,12 @@ paddlex --install serving paddlex --serve --pipeline PaddleOCR-VL ``` +如需在服务化部署中切换到 `transformers` 引擎,可参考如下示例: + +```shell +paddlex --serve --pipeline PaddleOCR-VL --engine transformers +``` + 启动后将看到类似如下输出,服务器默认监听 **8080** 端口: ```text @@ -2120,7 +2298,7 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) layoutThreshold -number | object | null +number | object | null 请参阅PaddleOCR-VL对象中 predict 方法的 layout_threshold 参数相关说明。 否 @@ -2245,6 +2423,12 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) 否 +outputFormats +array | null +可选。需要额外返回的文档格式列表。默认不返回任何附加格式。当前仅支持 "docx"。 +否 + + visualize boolean | null 是否返回可视化结果图以及处理过程中的中间图像等。 @@ -2317,6 +2501,11 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) string | null 输入图像。图像为JPEG格式,使用Base64编码。 + +exports +object | null +可选的附加导出结果。仅当请求体中包含 outputFormats 且列出相应格式时出现。例如 {"docx": {"content": "..."}},其中 content 为文件内容的Base64编码。 +

            markdown为一个object,具有如下属性:

            @@ -2396,6 +2585,12 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) 输出的 Markdown 文本中是否包含公式编号。默认为 false。 否 + +outputFormats +array | null +可选。附加导出格式,含义与 infer 中的 outputFormats 相同。当前仅支持 "docx"。 +否 +

            pages中的每个元素为一个object,具有如下属性:

            @@ -2435,7 +2630,7 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) layoutParsingResults array -重构后的版面解析结果。其中每个元素包含的字段请参见对infer操作返回结果的说明(不含可视化结果图和中间图像)。 +重构后的版面解析结果。其中每个元素包含的字段请参见对 infer 操作返回结果的说明(不含可视化结果图和中间图像)。 diff --git a/docs/version3.x/pipeline_usage/doc_preprocessor.en.md b/docs/version3.x/pipeline_usage/doc_preprocessor.en.md index d05e0d1dd83..f11b18d6887 100644 --- a/docs/version3.x/pipeline_usage/doc_preprocessor.en.md +++ b/docs/version3.x/pipeline_usage/doc_preprocessor.en.md @@ -18,6 +18,7 @@ The Document Image Preprocessing Pipeline integrates two key functions: document In this pipeline, you can select the models to use based on the benchmark data provided below. > The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> In the inference time columns labeled [Normal Mode / High-Performance Mode], the Normal Mode values correspond to the local `paddle_static` inference engine.
            Document Image Orientation Classification Module (Optional): @@ -95,7 +96,7 @@ In this pipeline, you can select the models to use based on the benchmark data p
          7. Software Environment:
            • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
            • -
            • paddlepaddle 3.0.0 / paddleocr 3.0.3
            • +
            • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
    @@ -117,7 +118,7 @@ In this pipeline, you can select the models to use based on the benchmark data p Normal Mode FP32 Precision / No TRT Acceleration FP32 Precision / 8 Threads - PaddleInference + paddle_static High-Performance Mode @@ -152,6 +153,18 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --use_doc_unwarping True paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu ``` +The examples above use the local `paddle_static` inference engine by default. To run them, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured by following [Inference Engine and Configuration](../inference_engine.en.md), and then run the following command: + +```bash +# Use the transformers engine for inference +paddleocr doc_preprocessor -i https://paddle-model-ecology.bj.bcebos.com/paddlex/demo_image/doc_test_rotated.jpg \ + --engine transformers +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. +
    The command line supports more parameter settings. Click to expand for detailed explanations of command line parameters. @@ -249,56 +262,63 @@ If not set, the pipeline initialized value for this parameter will be used. Duri + + + + + + - + - + - - + - +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. + - + - + - + +
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    enable_hpiMeaning:Whether to enable high-performance inference.Meaning: Whether to enable high-performance inference. boolFalseNone
    use_tensorrtMeaning:Whether to use the Paddle Inference TensorRT subgraph engine.
    +
    Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
    Description: -If the model does not support acceleration through TensorRT, setting this flag will not enable acceleration.
    -For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6), and it is recommended to install TensorRT 8.6.1.6.
    - +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
    +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
    bool False
    precisionMeaning:The computational precision, such as fp32, fp16.Meaning: Computation precision, such as fp32 or fp16. str fp32
    enable_mkldnnMeaning:Whether to enable MKL-DNN acceleration for inference.
    +
    Meaning: Whether to enable MKL-DNN accelerated inference.
    Description: -If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set.
    bool True
    mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. +Meaning: MKL-DNN cache capacity. int 10
    cpu_threadsMeaning:The number of threads used for inference on the CPU.Meaning: Number of threads used for inference on CPU. int810
    paddlex_configMeaning:Path to PaddleX pipeline configuration file.Meaning: Path to the PaddleX pipeline configuration file. str
    @@ -332,6 +352,28 @@ for res in output: res.save_to_json("./output/") ``` +The example above uses the local `paddle_static` inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured by following [Inference Engine and Configuration](../inference_engine.en.md), and then run the following code: + +```python +from paddleocr import DocPreprocessor + +pipeline = DocPreprocessor( + engine="transformers", +) +# docpp = DocPreprocessor(use_doc_orientation_classify=True) # Specify whether to use the document orientation classification model via use_doc_orientation_classify +# docpp = DocPreprocessor(use_doc_unwarping=True) # Specify whether to use the text image unwarping module via use_doc_unwarping +# docpp = DocPreprocessor(device="gpu") # Specify whether to use GPU for model inference via device +output = pipeline.predict("./doc_test_rotated.jpg") +for res in output: + res.print() ## Print the structured output of the prediction + res.save_to_img("./output/") + res.save_to_json("./output/") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + In the above Python script, the following steps are executed: (1) Instantiate the doc_preprocessor pipeline object via DocPreprocessor(). The specific parameter descriptions are as follows: @@ -414,57 +456,71 @@ Support for specifying specific card numbers: str|None None + +engine +Meaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + + enable_hpi -Meaning:Whether to enable high-performance inference. +Meaning: Whether to enable high-performance inference. bool -False +None use_tensorrt -Meaning:Whether to use the Paddle Inference TensorRT subgraph engine.
    +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
    Description: -If the model does not support acceleration through TensorRT, setting this flag will not enable acceleration.
    -For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6), and it is recommended to install TensorRT 8.6.1.6.
    - +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
    +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
    bool False precision -Meaning:The computational precision, such as fp32, fp16. +Meaning: Computation precision, such as "fp32" or "fp16". str "fp32" enable_mkldnn -Meaning:Whether to enable MKL-DNN acceleration for inference.
    +Meaning: Whether to enable MKL-DNN accelerated inference.
    Description: -If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. + bool True mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity.
    +Meaning: MKL-DNN cache capacity. int 10 cpu_threads -Meaning:The number of threads used for inference on the CPU. +Meaning: Number of threads used for inference on CPU. int -8 +10 paddlex_config -Meaning:Path to PaddleX pipeline configuration file. +Meaning: Path to the PaddleX pipeline configuration file. str|None None + diff --git a/docs/version3.x/pipeline_usage/doc_preprocessor.md b/docs/version3.x/pipeline_usage/doc_preprocessor.md index cc0993446c8..3abdc41de8e 100644 --- a/docs/version3.x/pipeline_usage/doc_preprocessor.md +++ b/docs/version3.x/pipeline_usage/doc_preprocessor.md @@ -18,6 +18,7 @@ comments: true 在本产线中,您可以根据下方的基准测试数据选择使用的模型。 > 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 在带有 [常规模式 / 高性能模式] 标记的推理耗时列中,`常规模式` 对应本地推理引擎 `paddle_static`。
    文档图像方向分类模块(可选): @@ -95,7 +96,7 @@ comments: true
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -117,7 +118,7 @@ comments: true 常规模式 FP32精度 / 无TRT加速 FP32精度 / 8线程 - PaddleInference + paddle_static 高性能模式 @@ -154,6 +155,18 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --use_doc_unwarping True paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu ``` +上述命令默认使用本地推理引擎 `paddle_static`。如需运行,请先参考[飞桨框架安装说明](../paddlepaddle_installation.md)安装 PaddlePaddle。 + +如果选择 `transformers` 作为推理引擎,请先参考[推理引擎文档](../inference_engine.md)完成 Transformers 环境配置,然后执行如下命令: + +```bash +# 使用 transformers 引擎进行推理 +paddleocr doc_preprocessor -i https://paddle-model-ecology.bj.bcebos.com/paddlex/demo_image/doc_test_rotated.jpg \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 +
    命令行支持更多参数设置,点击展开以查看命令行参数的详细说明 @@ -254,10 +267,16 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu + + + + + + - + @@ -265,14 +284,13 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu 说明: 如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
    对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
    - - + @@ -297,7 +315,7 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu - + @@ -305,6 +323,7 @@ paddleocr doc_preprocessor -i ./doc_test_rotated.jpg --device gpu +
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic 和 transformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。
    str|NoneNone
    enable_hpi 含义:是否启用高性能推理。 boolFalseNone
    use_tensorrt bool False
    precision含义:计算精度,如 fp32、fp16。含义:计算精度,如 fp32fp16 str fp32
    cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int810
    paddlex_configstr
    @@ -339,6 +358,28 @@ for res in output: res.save_to_json("./output/") ``` +上述代码默认使用本地推理引擎 `paddle_static`。如需运行,请先参考[飞桨框架安装说明](../paddlepaddle_installation.md)安装 PaddlePaddle。 + +如果选择 `transformers` 作为推理引擎,请先参考[推理引擎文档](../inference_engine.md)完成 Transformers 环境配置,然后执行如下代码: + +```python +from paddleocr import DocPreprocessor + +pipeline = DocPreprocessor( + engine="transformers", +) +# docpp = DocPreprocessor(use_doc_orientation_classify=True) # 通过 use_doc_orientation_classify 指定是否使用文档方向分类模型 +# docpp = DocPreprocessor(use_doc_unwarping=True) # 通过 use_doc_unwarping 指定是否使用文本图像矫正模块 +# docpp = DocPreprocessor(device="gpu") # 通过 device 指定模型推理时使用 GPU +output = pipeline.predict("./doc_test_rotated.jpg") +for res in output: + res.print() ## 打印预测的结构化输出 + res.save_to_img("./output/") + res.save_to_json("./output/") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 在上述 Python 脚本中,执行了如下几个步骤: (1)通过 DocPreprocessor() 实例化 doc_preprocessor 产线对象:具体参数说明如下: @@ -421,11 +462,24 @@ for res in output: + + + + + + + + + + + + + - + @@ -433,14 +487,13 @@ for res in output: 说明: 如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
    对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
    - - + @@ -465,7 +518,7 @@ for res in output: - + @@ -473,6 +526,7 @@ for res in output: +
    str|None None
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic 和 transformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。
    str|NoneNone
    engine_config含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明
    dict|NoneNone
    enable_hpi 含义:是否启用高性能推理。 boolFalseNone
    use_tensorrt bool False
    precision含义:计算精度,如 fp32、fp16。含义:计算精度,如 "fp32""fp16" str "fp32"
    cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int810
    paddlex_configstr|None None
    diff --git a/docs/version3.x/pipeline_usage/doc_understanding.en.md b/docs/version3.x/pipeline_usage/doc_understanding.en.md index 7b888e385fb..a8b526cde48 100644 --- a/docs/version3.x/pipeline_usage/doc_understanding.en.md +++ b/docs/version3.x/pipeline_usage/doc_understanding.en.md @@ -139,8 +139,59 @@ Supports specifying a specific card number: +engine +Meaning: Inference engine.
    Description: Supports None (the default), paddle, and paddle_dynamic. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str + + + +enable_hpi +Meaning: Whether to enable high-performance inference. +bool +None + + +use_tensorrt +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
    +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
    +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
    + +bool +False + + +precision +Meaning: Computation precision, such as "fp32" or "fp16". +str +"fp32" + + +enable_mkldnn +Meaning: Whether to enable MKL-DNN accelerated inference.
    +Description: +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. + +bool +True + + +mkldnn_cache_capacity + +Meaning: MKL-DNN cache capacity. + +int +10 + + +cpu_threads +Meaning: Number of threads used for inference on CPU. +int +10 + + paddlex_config -Meaning:Path to PaddleX pipeline configuration file. +Meaning: Path to the PaddleX pipeline configuration file. str @@ -233,6 +284,18 @@ Supports specifying a specific card number: None +engine +Meaning: Inference engine.
    Description: Supports None (the default), paddle, and paddle_dynamic. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + + paddlex_config Meaning:Path to PaddleX pipeline configuration file. str|None diff --git a/docs/version3.x/pipeline_usage/doc_understanding.md b/docs/version3.x/pipeline_usage/doc_understanding.md index d9d0bd153f6..537f450e213 100644 --- a/docs/version3.x/pipeline_usage/doc_understanding.md +++ b/docs/version3.x/pipeline_usage/doc_understanding.md @@ -134,11 +134,63 @@ paddleocr doc_understanding -i "{'image': 'https://paddle-model-ecology.bj.bcebo +engine +含义:推理引擎。
    说明:支持 None(默认值)、paddle 和 paddle_dynamic。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 +str|None +None + + +enable_hpi +含义:是否启用高性能推理。 +bool +None + + +use_tensorrt +含义:是否启用 Paddle Inference 的 TensorRT 子图引擎。
    +说明: +如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
    +对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
    + +bool +False + + +precision +含义:计算精度,如 fp32fp16。 +str +fp32 + + +enable_mkldnn +含义:是否启用 MKL-DNN 加速推理。
    +说明: +如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 + +bool +True + + +mkldnn_cache_capacity + +含义:MKL-DNN 缓存容量。 + +int +10 + + +cpu_threads +含义:在 CPU 上进行推理时使用的线程数。 +int +10 + + paddlex_config 含义:PaddleX产线配置文件路径。 str +
    @@ -228,6 +280,18 @@ for res in output: None +engine +含义:推理引擎。
    说明:支持 None(默认值)、paddle 和 paddle_dynamic。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 +str|None +None + + +engine_config +含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。 +dict|None +None + + paddlex_config 含义:PaddleX产线配置文件路径。 str|None diff --git a/docs/version3.x/pipeline_usage/formula_recognition.en.md b/docs/version3.x/pipeline_usage/formula_recognition.en.md index 7c9f2911bab..d09622cd3dc 100644 --- a/docs/version3.x/pipeline_usage/formula_recognition.en.md +++ b/docs/version3.x/pipeline_usage/formula_recognition.en.md @@ -24,6 +24,7 @@ The formula recognition pipeline includes the following four modules. Each modul In this pipeline, you can choose the model you want to use based on the benchmark data provided below. > The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> In the inference time columns labeled [Normal Mode / High-Performance Mode], the Normal Mode values correspond to the local `paddle_static` inference engine.
    Document Image Orientation Classification Module (Optional): @@ -376,7 +377,7 @@ In this pipeline, you can choose the model you want to use based on the benchmar
  • Software Environment:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -398,7 +399,7 @@ In this pipeline, you can choose the model you want to use based on the benchmar Normal Mode FP32 Precision / No TRT Acceleration FP32 Precision / 8 Threads - PaddleInference + paddle_static High-Performance Mode @@ -435,6 +436,18 @@ paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png --device gpu ``` +The examples above use the local `paddle_static` inference engine by default. To run them, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured by following [Inference Engine and Configuration](../inference_engine.en.md), and then run the following command: + +```bash +# Use the transformers engine for inference +paddleocr formula_recognition_pipeline -i https://paddle-model-ecology.bj.bcebos.com/paddlex/demo_image/pipelines/general_formula_recognition_001.png \ + --engine transformers +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. +
    The command line supports more parameter settings. Click to expand for detailed descriptions of the command line parameters. @@ -655,32 +668,38 @@ You can specify a particular card number: + + + + + + - + - + -Meaning:Whether to use the Paddle Inference TensorRT subgraph engine. If the model does not support acceleration through TensorRT, setting this flag will not enable acceleration.
    -Description: -For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6), and it is recommended to install TensorRT 8.6.1.6.
    - + - + - @@ -688,24 +707,24 @@ If MKL-DNN is unavailable or the model does not support it, acceleration will no - + - + - + +
    engineMeaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration.
    str|NoneNone
    enable_hpiMeaning:Whether to enable the high-performance inference plugin.Meaning: Whether to enable high-performance inference. boolFalseNone
    use_tensorrtMeaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
    +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
    +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
    bool False
    precisionMeaning:Compute precision, such as FP32 or FP16.Meaning: Computation precision, such as fp32 or fp16. str fp32
    enable_mkldnnMeaning:Whether to enable MKL-DNN acceleration for inference.
    -Description: -If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +
    Meaning: Whether to enable MKL-DNN accelerated inference.
    +Description: +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set.
    bool True
    mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. +Meaning: MKL-DNN cache capacity. int 10
    cpu_threads -Meaning:The number of threads to use when performing inference on the CPU.Meaning: Number of threads used for inference on CPU. int810
    paddlex_configMeaning:Path to PaddleX pipeline configuration file.Meaning: Path to the PaddleX pipeline configuration file. str
    @@ -751,6 +770,28 @@ for res in output: res.save_to_json(save_path="output") ## Save the structured JSON result of the current image ``` +The example above uses the local `paddle_static` inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured by following [Inference Engine and Configuration](../inference_engine.en.md), and then run the following code: + +```python +from paddleocr import FormulaRecognitionPipeline + +pipeline = FormulaRecognitionPipeline( + engine="transformers", +) +# ocr = FormulaRecognitionPipeline(use_doc_orientation_classify=True) # Specify whether to use the document orientation classification model with use_doc_orientation_classify. +# ocr = FormulaRecognitionPipeline(use_doc_unwarping=True) # Specify whether to use the text image unwarping module with use_doc_unwarping. +# ocr = FormulaRecognitionPipeline(device="gpu") # Specify the use of GPU for model inference with device. +output = pipeline.predict("./general_formula_recognition_001.png") +for res in output: + res.print() ## Print the structured output of the prediction + res.save_to_img(save_path="output") ## Save the formula visualization result of the current image. + res.save_to_json(save_path="output") ## Save the structured JSON result of the current image +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + In the above Python script, the following steps are executed: (1)Instantiate the formula recognition pipeline object through create_pipeline(), with specific parameters as follows: @@ -953,33 +994,46 @@ You can specify a particular card number: str|None None + +engine +Meaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + + enable_hpi -Meaning:Whether to enable the high-performance inference plugin. +Meaning: Whether to enable high-performance inference. bool -False +None use_tensorrt -Meaning:Whether to use the Paddle Inference TensorRT subgraph engine. If the model does not support acceleration through TensorRT, setting this flag will not enable acceleration.
    -Description: -For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6), and it is recommended to install TensorRT 8.6.1.6.
    - +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
    +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
    +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
    bool False precision -Meaning:Compute precision, such as FP32 or FP16. +Meaning: Computation precision, such as "fp32" or "fp16". str "fp32" enable_mkldnn -Meaning:Whether to enable MKL-DNN acceleration for inference.
    -Description: -If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +Meaning: Whether to enable MKL-DNN accelerated inference.
    +Description: +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. bool True @@ -987,23 +1041,24 @@ If MKL-DNN is unavailable or the model does not support it, acceleration will no mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. +Meaning: MKL-DNN cache capacity. int 10 cpu_threads -Meaning:The number of threads to use when performing inference on the CPU. +Meaning: Number of threads used for inference on CPU. int -8 +10 paddlex_config -Meaning:Path to PaddleX pipeline configuration file. +Meaning: Path to the PaddleX pipeline configuration file. str|None None + diff --git a/docs/version3.x/pipeline_usage/formula_recognition.md b/docs/version3.x/pipeline_usage/formula_recognition.md index 12b40151b2d..3aa9ec33111 100644 --- a/docs/version3.x/pipeline_usage/formula_recognition.md +++ b/docs/version3.x/pipeline_usage/formula_recognition.md @@ -22,6 +22,7 @@ comments: true 在本产线中,您可以根据下方的基准测试数据选择使用的模型。 > 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 在带有 [常规模式 / 高性能模式] 标记的推理耗时列中,`常规模式` 对应本地推理引擎 `paddle_static`。
    文档图像方向分类模块(可选): @@ -376,7 +377,7 @@ comments: true
  • 软件环境:
    • Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
    • -
    • paddlepaddle 3.0.0 / paddleocr 3.0.3
    • +
    • paddlepaddle-gpu 3.0.0 / paddleocr 3.0.3
  • @@ -398,7 +399,7 @@ comments: true 常规模式 FP32精度 / 无TRT加速 FP32精度 / 8线程 - PaddleInference + paddle_static 高性能模式 @@ -436,6 +437,18 @@ paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png --device gpu ``` +上述命令默认使用本地推理引擎 `paddle_static`。如需运行,请先参考[飞桨框架安装说明](../paddlepaddle_installation.md)安装 PaddlePaddle。 + +如果选择 `transformers` 作为推理引擎,请先参考[推理引擎文档](../inference_engine.md)完成 Transformers 环境配置,然后执行如下命令: + +```bash +# 使用 transformers 引擎进行推理 +paddleocr formula_recognition_pipeline -i https://paddle-model-ecology.bj.bcebos.com/paddlex/demo_image/pipelines/general_formula_recognition_001.png \ + --engine transformers +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 +
    命令行支持更多参数设置,点击展开以查看命令行参数的详细说明 @@ -625,31 +638,38 @@ paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png + + + + + + - + - + @@ -657,7 +677,7 @@ paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png @@ -666,7 +686,7 @@ paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png - + @@ -674,6 +694,7 @@ paddleocr formula_recognition_pipeline -i ./general_formula_recognition_001.png +
    engine含义:推理引擎。
    说明:支持 None(默认值)、paddle、paddle_static、paddle_dynamic 和 transformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。
    str|NoneNone
    enable_hpi 含义:是否启用高性能推理。 boolFalseNone
    use_tensorrt 含义:是否启用 Paddle Inference 的 TensorRT 子图引擎。
    -说明:如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
    +说明: +如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
    对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
    -
    bool False
    precision计算精度,如 fp32、fp16。含义:计算精度,如 fp32fp16 str fp32
    enable_mkldnn 含义:是否启用 MKL-DNN 加速推理。
    -说明:如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 +说明: +如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。
    bool True
    mkldnn_cache_capacity -含义:MKL-DNN 缓存容量。
    +含义:MKL-DNN 缓存容量。
    int 10cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int810
    paddlex_configstr
    @@ -718,6 +739,28 @@ for res in output: res.save_to_json(save_path="output") ## 保存当前图像的结构化json结果 ``` +上述代码默认使用本地推理引擎 `paddle_static`。如需运行,请先参考[飞桨框架安装说明](../paddlepaddle_installation.md)安装 PaddlePaddle。 + +如果选择 `transformers` 作为推理引擎,请先参考[推理引擎文档](../inference_engine.md)完成 Transformers 环境配置,然后执行如下代码: + +```python +from paddleocr import FormulaRecognitionPipeline + +pipeline = FormulaRecognitionPipeline( + engine="transformers", +) +# ocr = FormulaRecognitionPipeline(use_doc_orientation_classify=True) # 通过 use_doc_orientation_classify 指定是否使用文档方向分类模型 +# ocr = FormulaRecognitionPipeline(use_doc_unwarping=True) # 通过 use_doc_unwarping 指定是否使用文本图像矫正模块 +# ocr = FormulaRecognitionPipeline(device="gpu") # 通过 device 指定模型推理时使用 GPU +output = pipeline.predict("./general_formula_recognition_001.png") +for res in output: + res.print() ## 打印预测的结构化输出 + res.save_to_img(save_path="output") ## 保存当前图像的公式可视化结果 + res.save_to_json(save_path="output") ## 保存当前图像的结构化json结果 +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 在上述 Python 脚本中,执行了如下几个步骤: (1)通过 `FormulaRecognitionPipeline()` 实例化公式识别产线对象,具体参数说明如下: @@ -901,32 +944,46 @@ for res in output: str|None None + +engine +含义:推理引擎。
    说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 +str|None +None + + +engine_config +含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。 +dict|None +None + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt 含义:是否启用 Paddle Inference 的 TensorRT 子图引擎。
    -说明:如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
    +说明: +如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
    对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
    - bool False precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 "fp32""fp16"str "fp32" enable_mkldnn 含义:是否启用 MKL-DNN 加速推理。
    -说明:如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 +说明: +如果 MKL-DNN 不可用或模型不支持通过 MKL-DNN 加速,即使设置了此标志,也不会使用加速。 bool True @@ -943,7 +1000,7 @@ for res in output: cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config @@ -951,6 +1008,7 @@ for res in output: str|None None + diff --git a/docs/version3.x/pipeline_usage/seal_recognition.en.md b/docs/version3.x/pipeline_usage/seal_recognition.en.md index 2d9172334f9..5d11d7c5d88 100644 --- a/docs/version3.x/pipeline_usage/seal_recognition.en.md +++ b/docs/version3.x/pipeline_usage/seal_recognition.en.md @@ -22,6 +22,7 @@ The seal text recognition pipeline is used to recognize the text content of seal In this pipeline, you can choose the model to use based on the benchmark data below. > The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> In the inference time columns labeled [Regular Mode / High-Performance Mode], the Regular Mode values correspond to the local `paddle_static` inference engine.
    Layout Region Detection Module (Optional): @@ -662,7 +663,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">Inference Model/Inference Model/Inference Engine and Configuration. +str|None +None + + enable_hpi -Whether to enable high-performance inference. +Meaning: Whether to enable high-performance inference. bool -False +None use_tensorrt -Meaning:Whether to use the Paddle Inference TensorRT subgraph engine.
    -Description: -If the model does not support acceleration through TensorRT, setting this flag will not enable acceleration.
    -For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6), and it is recommended to install TensorRT 8.6.1.6.
    - +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
    +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
    +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
    bool False precision -Meaning:The computational precision, such as fp32, fp16. +Meaning: Computation precision, such as fp32 or fp16. str fp32 enable_mkldnn -Meaning:Whether to enable MKL-DNN acceleration for inference.
    -Description: -If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +Meaning: Whether to enable MKL-DNN accelerated inference.
    +Description: +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. + bool True mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. +Meaning: MKL-DNN cache capacity. int 10 cpu_threads -Meaning:The number of threads used for inference on the CPU. +Meaning: Number of threads used for inference on CPU. int -8 +10 paddlex_config -Meaning:Path to PaddleX pipeline configuration file. +Meaning: Path to the PaddleX pipeline configuration file. str +
    @@ -1077,6 +1099,26 @@ for res in output: res.save_to_json("./output/") ``` +The example above uses the local `paddle_static` inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured by following [Inference Engine and Configuration](../inference_engine.en.md), and then run the following code: + +```python +from paddleocr import SealRecognition + +pipeline = SealRecognition( + engine="transformers", +) +# ocr = SealRecognition(device="gpu") # Specify GPU for model inference +output = pipeline.predict("./seal_text_det.png") +for res in output: + res.print() ## Print structured prediction results + res.save_to_img("./output/") + res.save_to_json("./output/") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + In the above Python script, the following steps were executed: (1) Instantiate a pipeline object for seal text recognition using the SealRecognition() class, with specific parameter descriptions as follows: @@ -1343,56 +1385,71 @@ Supports specifying device ID: str|None None + +engine +Meaning: Inference engine.
    Description: Supports None (the default), paddle, paddle_static, paddle_dynamic, and transformers. When left as None, PaddleOCR preserves the behavior of earlier versions, which in most configurations is equivalent to paddle. For detailed descriptions, supported values, compatibility rules, and examples, see Inference Engine and Configuration. +str|None +None + + +engine_config +Meaning: Inference-engine configuration.
    Description: Recommended together with engine. For supported fields, compatibility rules, and examples, see Inference Engine and Configuration. +dict|None +None + + enable_hpi -Meaning:Whether to enable high-performance inference. +Meaning: Whether to enable high-performance inference. bool -False +None use_tensorrt -Meaning:Whether to use the Paddle Inference TensorRT subgraph engine. If the model does not support acceleration through TensorRT, setting this flag will not enable acceleration.
    -Description: -For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6), and it is recommended to install TensorRT 8.6.1.6.
    - +Meaning: Whether to enable the TensorRT subgraph engine of Paddle Inference.
    +Description: +If the model does not support TensorRT acceleration, acceleration will not be used even if this flag is set.
    +For CUDA 11.8 versions of PaddlePaddle, the compatible TensorRT version is 8.x (x>=6). TensorRT 8.6.1.6 is recommended.
    bool False precision -Meaning:Computation precision, e.g., fp32, fp16. +Meaning: Computation precision, such as "fp32" or "fp16". str "fp32" enable_mkldnn -Meaning:Whether to enable MKL-DNN acceleration for inference.
    -Description: -If MKL-DNN is unavailable or the model does not support it, acceleration will not be used even if this flag is set. +Meaning: Whether to enable MKL-DNN accelerated inference.
    +Description: +If MKL-DNN is unavailable or the model does not support MKL-DNN acceleration, acceleration will not be used even if this flag is set. + bool True mkldnn_cache_capacity -Meaning:MKL-DNN cache capacity. +Meaning: MKL-DNN cache capacity. int 10 cpu_threads -Meaning:Number of threads used for inference on CPU. +Meaning: Number of threads used for inference on CPU. int -8 +10 paddlex_config -Meaning:Path to the PaddleX pipeline configuration file. +Meaning: Path to the PaddleX pipeline configuration file. str|None None + diff --git a/docs/version3.x/pipeline_usage/seal_recognition.md b/docs/version3.x/pipeline_usage/seal_recognition.md index 32114854480..f28b915b937 100644 --- a/docs/version3.x/pipeline_usage/seal_recognition.md +++ b/docs/version3.x/pipeline_usage/seal_recognition.md @@ -22,6 +22,7 @@ comments: true 在本产线中,您可以根据下方的基准测试数据选择使用的模型。 > 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 在带有 [常规模式 / 高性能模式] 标记的推理耗时列中,`常规模式` 对应本地推理引擎 `paddle_static`。
    版面区域检测模块(可选): @@ -665,7 +666,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">推理模型/推理模型/推理引擎与配置说明。 +str|None +None + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt @@ -989,14 +1010,13 @@ paddleocr seal_recognition -i ./seal_text_det.png --device gpu 说明: 如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
    对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
    - bool False precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 fp32fp16str fp32 @@ -1021,7 +1041,7 @@ paddleocr seal_recognition -i ./seal_text_det.png --device gpu cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config @@ -1029,6 +1049,7 @@ paddleocr seal_recognition -i ./seal_text_det.png --device gpu str +
    @@ -1079,6 +1100,26 @@ for res in output: res.save_to_json("./output/") ``` +上述代码默认使用本地推理引擎 `paddle_static`。如需运行,请先参考[飞桨框架安装说明](../paddlepaddle_installation.md)安装 PaddlePaddle。 + +如果选择 `transformers` 作为推理引擎,请先参考[推理引擎文档](../inference_engine.md)完成 Transformers 环境配置,然后执行如下代码: + +```python +from paddleocr import SealRecognition + +pipeline = SealRecognition( + engine="transformers", +) +# ocr = SealRecognition(device="gpu") # 通过 device 指定模型推理时使用 GPU +output = pipeline.predict("./seal_text_det.png") +for res in output: + res.print() ## 打印预测的结构化输出 + res.save_to_img("./output/") + res.save_to_json("./output/") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 在上述 Python 脚本中,执行了如下几个步骤: (1)通过 SealRecognition() 实例化 印章文本识别 产线对象,具体参数说明如下: @@ -1340,11 +1381,24 @@ for res in output: str|None None + +engine +含义:推理引擎。
    说明:支持 None(默认值)、paddlepaddle_staticpaddle_dynamictransformers。保持为默认值 None 时,PaddleOCR 保留旧版本的行为,在大多数配置下等价于 paddle。详细说明、取值、兼容性规则与示例请参见 推理引擎与配置说明。 +str|None +None + + +engine_config +含义:推理引擎配置。
    说明:推荐与 engine 搭配使用。详细字段、兼容性规则与示例请参见 推理引擎与配置说明。 +dict|None +None + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt @@ -1352,14 +1406,13 @@ for res in output: 说明: 如果模型不支持通过 TensorRT 加速,即使设置了此标志,也不会使用加速。
    对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6),建议安装 TensorRT 8.6.1.6。
    - bool False precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 "fp32""fp16"str "fp32" @@ -1384,7 +1437,7 @@ for res in output: cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config @@ -1392,6 +1445,7 @@ for res in output: str|None None + diff --git a/docs/version3.x/pipeline_usage/table_recognition_v2.en.md b/docs/version3.x/pipeline_usage/table_recognition_v2.en.md index 5945b298e4d..be2301d6259 100644 --- a/docs/version3.x/pipeline_usage/table_recognition_v2.en.md +++ b/docs/version3.x/pipeline_usage/table_recognition_v2.en.md @@ -28,6 +28,7 @@ This pipeline is applicable in a variety of fields, including general, manufactu In this pipeline, you can choose the models to use based on the benchmark data below. > The inference time only includes the model inference time and does not include the time for pre- or post-processing. +> In the inference time columns labeled [Regular Mode / High-Performance Mode], the Regular Mode values correspond to the local `paddle_static` inference engine.
    Table Structure Recognition Module Models: @@ -806,7 +807,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">Inference Model/Inference Model/Inference Engine and Configuration. +str + + + enable_hpi Meaning:Whether to enable high-performance inference. bool -False +None use_tensorrt @@ -1186,7 +1205,7 @@ For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6) precision -Meaning:Computation precision, such as fp32, fp16. +Meaning:Computation precision, such as fp32, fp16. str fp32 @@ -1210,11 +1229,11 @@ If MKL-DNN is unavailable or the model does not support it, acceleration will no cpu_threads Meaning:Number of threads to use for inference on the CPU. int -8 +10 paddlex_config -Meaning:Path to PaddleX pipeline configuration file. +Meaning: Path to the PaddleX pipeline configuration file.
    Description: Use this parameter when you need to configure advanced options such as engine_config through a configuration file. See PaddleOCR and PaddleX. str @@ -1287,6 +1306,30 @@ for res in output: res.save_to_json("./output/") ``` +The example above uses the local `paddle_static` inference engine by default. To run it, first install PaddlePaddle by following [PaddlePaddle Framework Installation](../paddlepaddle_installation.en.md). + +If you choose `transformers` as the inference engine, make sure the Transformers environment is configured by following [Inference Engine and Configuration](../inference_engine.en.md), and then run the following code: + +```python +from paddleocr import TableRecognitionPipelineV2 + +pipeline = TableRecognitionPipelineV2( + engine="transformers", +) +# ocr = TableRecognitionPipelineV2(use_doc_orientation_classify=True) # Specify whether to use the document orientation classification model with use_doc_orientation_classify +# ocr = TableRecognitionPipelineV2(use_doc_unwarping=True) # Specify whether to use the text image unwarping module with use_doc_unwarping +# ocr = TableRecognitionPipelineV2(device="gpu") # Specify the device to use GPU for model inference +output = pipeline.predict("./table_recognition_v2.jpg") +for res in output: + res.print() ## Print the predicted structured output + res.save_to_img("./output/") + res.save_to_xlsx("./output/") + res.save_to_html("./output/") + res.save_to_json("./output/") +``` + +In most scenarios, the default `paddle_static` inference engine delivers better inference performance and is the recommended first choice. + In the above Python script, the following steps are performed: (1) Instantiate the general table recognition V2 pipeline object using TableRecognitionPipelineV2(). The specific parameter descriptions are as follows: @@ -1612,7 +1655,7 @@ Supports specifying a specific card number: enable_hpi Meaning:Whether to enable high-performance inference. 
bool -False +None use_tensorrt @@ -1626,7 +1669,7 @@ For Paddle with CUDA version 11.8, the compatible TensorRT version is 8.x (x>=6) precision -Meaning:Computation precision, such as fp32, fp16. +Meaning:Computation precision, such as "fp32", "fp16". str "fp32" @@ -1650,7 +1693,7 @@ If MKL-DNN is unavailable or the model does not support it, acceleration will no cpu_threads Meaning:Number of threads to use for inference on the CPU. int -8 +10 paddlex_config diff --git a/docs/version3.x/pipeline_usage/table_recognition_v2.md b/docs/version3.x/pipeline_usage/table_recognition_v2.md index f23755f506d..8f9ae0010d3 100644 --- a/docs/version3.x/pipeline_usage/table_recognition_v2.md +++ b/docs/version3.x/pipeline_usage/table_recognition_v2.md @@ -28,6 +28,7 @@ comments: true 在本产线中,您可以根据下方的基准测试数据选择使用的模型。 > 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 +> 在带有 [常规模式 / 高性能模式] 标记的推理耗时列中,`常规模式` 对应本地推理引擎 `paddle_static`。
    表格结构识别模块模型: @@ -808,7 +809,7 @@ devanagari_PP-OCRv3_mobile_rec_infer.tar">推理模型/推理模型/推理引擎与配置说明。 +str + + + enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt @@ -1196,7 +1215,7 @@ paddleocr table_recognition_v2 -i ./table_recognition_v2.jpg --device gpu precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 fp32fp16str fp32 @@ -1221,11 +1240,11 @@ paddleocr table_recognition_v2 -i ./table_recognition_v2.jpg --device gpu cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config -含义:PaddleX产线配置文件路径。 +含义:PaddleX产线配置文件路径。
    说明:如需通过配置文件设置 engine_config 等高级参数,可结合该参数使用。详见 PaddleOCR 与 PaddleXstr @@ -1297,6 +1316,30 @@ for res in output: res.save_to_json("./output/") ``` +上述代码默认使用本地推理引擎 `paddle_static`。如需运行,请先参考[飞桨框架安装说明](../paddlepaddle_installation.md)安装 PaddlePaddle。 + +如果选择 `transformers` 作为推理引擎,请先参考[推理引擎文档](../inference_engine.md)完成 Transformers 环境配置,然后执行如下代码: + +```python +from paddleocr import TableRecognitionPipelineV2 + +pipeline = TableRecognitionPipelineV2( + engine="transformers", +) +# ocr = TableRecognitionPipelineV2(use_doc_orientation_classify=True) # 通过 use_doc_orientation_classify 指定是否使用文档方向分类模型 +# ocr = TableRecognitionPipelineV2(use_doc_unwarping=True) # 通过 use_doc_unwarping 指定是否使用文本图像矫正模块 +# ocr = TableRecognitionPipelineV2(device="gpu") # 通过 device 指定模型推理时使用 GPU +output = pipeline.predict("./table_recognition_v2.jpg") +for res in output: + res.print() ## 打印预测的结构化输出 + res.save_to_img("./output/") + res.save_to_xlsx("./output/") + res.save_to_html("./output/") + res.save_to_json("./output/") +``` + +在大多数场景下,默认的 `paddle_static` 推理引擎通常具备更好的推理性能,建议优先使用。 + 在上述 Python 脚本中,执行了如下几个步骤: (1)通过 TableRecognitionPipelineV2() 实例化通用表格识别v2产线对象,具体参数说明如下: @@ -1603,7 +1646,7 @@ for res in output: enable_hpi 含义:是否启用高性能推理。 bool -False +None use_tensorrt @@ -1618,7 +1661,7 @@ for res in output: precision -含义:计算精度,如 fp32、fp16。 +含义:计算精度,如 "fp32""fp16"str "fp32" @@ -1643,7 +1686,7 @@ for res in output: cpu_threads 含义:在 CPU 上进行推理时使用的线程数。 int -8 +10 paddlex_config diff --git a/mkdocs.yml b/mkdocs.yml index 5dde49a4d00..7c17dfd0662 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -120,7 +120,7 @@ plugins: PaddleOCR-VL Apple Silicon 使用教程: PaddleOCR-VL Apple Silicon Usage Tutorial PaddleOCR-VL AMD GPU 使用教程: PaddleOCR-VL AMD GPU Usage Tutorial PaddleOCR-VL Intel Arc GPU 使用教程: PaddleOCR-VL Intel Arc GPU Usage Tutorial - 推理部署: Model Deploy + 推理部署: Inference and Deployment 高性能推理: High-Performance Inference 打包 PaddleOCR 项目: Package PaddleOCR Projects 获取ONNX模型: Obtaining ONNX Models @@ -149,6 +149,8 @@ 
plugins: 通用表格识别v2产线: General Table Recognition v2 Pipeline 图表解析模块: Chart Parsing Module PaddleOCR-VL产线: PaddleOCR-VL Pipeline + PP-DocTranslation产线: PP-DocTranslation Pipeline + 推理引擎与配置说明: Inference Engine and Configuration 多硬件使用: Multi-Devices Usage PaddleOCR 多硬件使用指南: PaddleOCR Multi-Devices Usage Guide 昇腾 NPU 飞桨安装教程: Ascend NPU PaddlePaddle Installation Tutorial @@ -184,7 +186,7 @@ plugins: 社区: Community 社区贡献: Community Contribution 附录: Appendix - 配置 PaddleOCR 推理包日志系统: Configure the logging system for the PaddleOCR inference package + 配置 paddleocr 包日志系统: Configure logging for the paddleocr Python package 模块概述: Module Overview 产线概述: Pipeline Overview 通用 OCR 产线 C++ 本地部署 - Linux: C++ Local Deployment for General OCR Pipeline - Linux @@ -336,6 +338,7 @@ nav: - 通用 OCR 产线 C++ 本地部署 - Linux: version3.x/deployment/cpp/OCR.md - 通用 OCR 产线 C++ 本地部署 - Windows: version3.x/deployment/cpp/OCR_windows.md - Benchmark: version3.x/pipeline_usage/instructions/benchmark.md + - 推理引擎与配置说明: version3.x/inference_engine.md - 模块列表: - 模块概述: version3.x/module_usage/module_overview.md @@ -368,7 +371,7 @@ nav: - 其他说明: - PaddleOCR 与 PaddleX: version3.x/paddleocr_and_paddlex.md - PaddleOCR 3.x 升级说明: update/upgrade_notes.md - - 配置 PaddleOCR 推理包日志系统: version3.x/logging.md + - 配置 paddleocr 包日志系统: version3.x/logging.md - 低代码全流程开发: - 概述: version3.x/paddlex/overview.md diff --git a/paddleocr/_common_args.py b/paddleocr/_common_args.py index 6054e5ae6ad..46bbee015d2 100644 --- a/paddleocr/_common_args.py +++ b/paddleocr/_common_args.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from paddlex.inference import PaddlePredictorOption from paddlex.utils.device import get_default_device, parse_device from ._constants import ( @@ -27,10 +26,19 @@ ) from ._utils.cli import str2bool +SUPPORTED_INFERENCE_ENGINE_LIST = [ + "paddle", + "paddle_static", + "paddle_dynamic", + "transformers", +] + def parse_common_args(kwargs, *, default_enable_hpi): default_vals = { "device": DEFAULT_DEVICE, + "engine": None, + "engine_config": None, "enable_hpi": default_enable_hpi, "use_tensorrt": DEFAULT_USE_TENSORRT, "precision": DEFAULT_PRECISION, @@ -46,6 +54,14 @@ def parse_common_args(kwargs, *, default_enable_hpi): kwargs = {**default_vals, **kwargs} + if ( + kwargs["engine"] is not None + and kwargs["engine"] not in SUPPORTED_INFERENCE_ENGINE_LIST + ): + raise ValueError( + f"Invalid engine: {kwargs['engine']}. Supported values are: {SUPPORTED_INFERENCE_ENGINE_LIST}." + ) + if kwargs["precision"] not in SUPPORTED_PRECISION_LIST: raise ValueError( f"Invalid precision: {kwargs['precision']}. Supported values are: {SUPPORTED_PRECISION_LIST}." 
@@ -57,39 +73,54 @@ def parse_common_args(kwargs, *, default_enable_hpi): return kwargs -def prepare_common_init_args(model_name, common_args): - device = common_args["device"] - if device is None: - device = get_default_device() - device_type, _ = parse_device(device) - - init_kwargs = {} - init_kwargs["device"] = device - init_kwargs["use_hpip"] = common_args["enable_hpi"] - - pp_option = PaddlePredictorOption() +def _build_paddle_static_engine_config(common_args, device_type): + cfg = {} if device_type == "gpu": if common_args["use_pptrt"]: if common_args["pptrt_precision"] == "fp32": - pp_option.run_mode = "trt_fp32" + cfg["run_mode"] = "trt_fp32" else: assert common_args["pptrt_precision"] == "fp16", common_args[ "pptrt_precision" ] - pp_option.run_mode = "trt_fp16" + cfg["run_mode"] = "trt_fp16" else: - pp_option.run_mode = "paddle" + cfg["run_mode"] = "paddle" elif device_type == "cpu": - enable_mkldnn = common_args["enable_mkldnn"] - if enable_mkldnn: - pp_option.mkldnn_cache_capacity = common_args["mkldnn_cache_capacity"] + if common_args["enable_mkldnn"]: + cfg["mkldnn_cache_capacity"] = common_args["mkldnn_cache_capacity"] else: - pp_option.run_mode = "paddle" - pp_option.cpu_threads = common_args["cpu_threads"] + cfg["run_mode"] = "paddle" + cfg["cpu_threads"] = common_args["cpu_threads"] else: - pp_option.run_mode = "paddle" - pp_option.enable_cinn = common_args["enable_cinn"] - init_kwargs["pp_option"] = pp_option + cfg["run_mode"] = "paddle" + cfg["enable_cinn"] = common_args["enable_cinn"] + return cfg + + +def prepare_common_init_args(model_name, common_args): + device = common_args["device"] + if device is None: + device = get_default_device() + device_type, _ = parse_device(device) + + init_kwargs = {} + init_kwargs["device"] = device + init_kwargs["engine"] = common_args["engine"] + init_kwargs["use_hpip"] = common_args["enable_hpi"] + + user_engine_config = common_args["engine_config"] + engine = common_args["engine"] + built = 
_build_paddle_static_engine_config(common_args, device_type) + + if user_engine_config is not None: + init_kwargs["engine_config"] = user_engine_config + elif engine == "paddle_static": + init_kwargs["engine_config"] = built + elif engine in (None, "paddle"): + init_kwargs["engine_config"] = {"paddle_static": built} + else: + init_kwargs["engine_config"] = None return init_kwargs @@ -105,6 +136,12 @@ def add_common_cli_opts(parser, *, default_enable_hpi, allow_multiple_devices): default=DEFAULT_DEVICE, help=help_, ) + parser.add_argument( + "--engine", + type=str, + choices=SUPPORTED_INFERENCE_ENGINE_LIST, + help="Inference engine to use. For CLI, engine-specific configuration should be set in the PaddleX YAML config file.", + ) parser.add_argument( "--enable_hpi", type=str2bool, diff --git a/pyproject.toml b/pyproject.toml index 0e8c49f7c0e..3e8b4bfdf98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,11 +63,11 @@ issues = "https://github.com/PaddlePaddle/PaddleOCR/issues" paddleocr = "paddleocr.__main__:console_entry" [project.optional-dependencies] -doc-parser = ["paddlex[ocr,genai-client]>=3.4.0,<3.5.0"] -ie = ["paddlex[ie]>=3.4.0,<3.5.0"] -trans = ["paddlex[trans]>=3.4.0,<3.5.0"] +doc-parser = ["paddlex[ocr,genai-client]>=3.5.0,<3.6.0"] +ie = ["paddlex[ie]>=3.5.0,<3.6.0"] +trans = ["paddlex[trans]>=3.5.0,<3.6.0"] doc2md = ["python-docx>=0.8.11", "python-pptx>=0.6.21", "openpyxl>=3.0.0", "pylatexenc>=2.10,<3"] -all = ["paddlex[ocr,genai-client,ie,trans]>=3.4.0,<3.5.0", "python-docx>=0.8.11", "python-pptx>=0.6.21", "openpyxl>=3.0.0"] +all = ["paddlex[ocr,genai-client,ie,trans]>=3.5.0,<3.6.0", "python-docx>=0.8.11", "python-pptx>=0.6.21", "openpyxl>=3.0.0"] [tool.setuptools.packages.find] where = ["."] diff --git a/readme/README_cn.md b/readme/README_cn.md index 270a64624bc..5347adfd914 100644 --- a/readme/README_cn.md +++ b/readme/README_cn.md @@ -141,6 +141,7 @@ PaddleOCR官方网站提供交互式**体验中心**和**APIs**——无需设 👉 [访问官方网站](https://www.paddleocr.com) ### 步骤 2: 本地部署 
+ 对于本地使用,请根据您的需求参考以下文档: - **PP-OCR系列**:查看[PP-OCR文档](https://www.paddleocr.ai/latest/version3.x/pipeline_usage/OCR.html)