Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions paddleocr/_models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,27 @@


class PaddleXPredictorWrapper(metaclass=abc.ABCMeta):
"""Base class for single-model PaddleOCR wrappers.

Subclasses wrap a PaddleX predictor for a specific model (e.g. text
detection, text recognition) and expose ``predict`` / ``predict_iter``
methods along with optional CLI support. Each subclass must declare
``default_model_name`` and ``get_cli_subcommand_executor``.

Args:
model_name (str | None): Name of the model to load. Defaults to
``default_model_name`` when ``None``.
model_dir (str | None): Local directory containing model files.
Downloads from the model hub when ``None``.
**common_args: Common inference arguments forwarded to PaddleX (e.g.
``device``, ``use_hpip``, ``use_tensorrt``).

Example:
>>> from paddleocr import TextDetection
>>> detector = TextDetection()
>>> results = detector.predict("image.png")
"""

def __init__(
self,
*,
Expand Down Expand Up @@ -83,6 +104,12 @@ def _create_paddlex_predictor(self):


class PredictorCLISubcommandExecutor(CLISubcommandExecutor):
"""Base class for single-model CLI subcommand executors.

Registers a model predictor as a subcommand of the ``paddleocr`` CLI and
handles argument parsing and execution.
"""

@property
@abc.abstractmethod
def subparser_name(self):
Expand Down
27 changes: 27 additions & 0 deletions paddleocr/_pipelines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@


def _merge_dicts(d1, d2):
"""Recursively merge d2 into d1, with d2 values taking precedence."""
res = d1.copy()
for k, v in d2.items():
if k in res and isinstance(res[k], dict) and isinstance(v, dict):
Expand All @@ -41,6 +42,7 @@ def _merge_dicts(d1, d2):


def _to_builtin(obj):
"""Recursively convert AttrDict and nested structures to plain Python dicts/lists."""
if isinstance(obj, AttrDict):
return {k: _to_builtin(v) for k, v in obj.items()}
elif isinstance(obj, dict):
Expand All @@ -52,6 +54,25 @@ def _to_builtin(obj):


class PaddleXPipelineWrapper(metaclass=abc.ABCMeta):
"""Base class for PaddleOCR pipeline wrappers.

Subclasses wrap a PaddleX pipeline and expose a simplified Python API
with optional CLI support. Each subclass must declare
``_paddlex_pipeline_name`` and ``get_cli_subcommand_executor``.

Args:
paddlex_config (str | dict | None): Path to a PaddleX pipeline YAML
config file, a pre-loaded config dict, or ``None`` to use the
default config for the pipeline.
**common_args: Common inference arguments forwarded to PaddleX (e.g.
``device``, ``use_hpip``, ``use_tensorrt``).

Example:
>>> from paddleocr import PaddleOCR
>>> ocr = PaddleOCR(lang="en")
>>> results = ocr.predict("image.png")
"""

def __init__(
self,
*,
Expand Down Expand Up @@ -110,6 +131,12 @@ def _create_paddlex_pipeline(self):


class PipelineCLISubcommandExecutor(CLISubcommandExecutor):
"""Base class for pipeline CLI subcommand executors.

Registers a pipeline as a subcommand of the ``paddleocr`` CLI and handles
argument parsing and execution.
"""

@property
@abc.abstractmethod
def subparser_name(self):
Expand Down
84 changes: 84 additions & 0 deletions paddleocr/_pipelines/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,54 @@

# Be compatible with PaddleOCR 2.x interfaces
class PaddleOCR(PaddleXPipelineWrapper):
"""OCR pipeline that combines text detection and text recognition.

Runs the full OCR pipeline: optional document preprocessing (orientation
classification and unwarping), text detection, optional text-line
orientation classification, and text recognition.

Args:
lang (str | None): Language code for the input image (e.g. ``"ch"``,
``"en"``, ``"fr"``). Used to select default detection and
recognition models when no explicit model name/dir is provided.
Defaults to ``"ch"`` when ``None``.
ocr_version (str | None): PP-OCR version to use when ``lang`` is set.
One of ``"PP-OCRv3"``, ``"PP-OCRv4"``, ``"PP-OCRv5"``. Defaults
to the latest available for the chosen language.
text_detection_model_name (str | None): Name of the text detection
model. Overrides ``lang``/``ocr_version`` selection.
text_detection_model_dir (str | None): Local directory for the text
detection model.
text_recognition_model_name (str | None): Name of the text recognition
model. Overrides ``lang``/``ocr_version`` selection.
text_recognition_model_dir (str | None): Local directory for the text
recognition model.
use_doc_orientation_classify (bool | None): Enable document orientation
classification preprocessing.
use_doc_unwarping (bool | None): Enable document unwarping
preprocessing.
use_textline_orientation (bool | None): Enable text-line orientation
classification.
text_det_thresh (float | None): Pixel-level detection threshold.
text_det_box_thresh (float | None): Box-level detection threshold.
text_det_unclip_ratio (float | None): Expansion ratio for detected
text bounding boxes.
text_rec_score_thresh (float | None): Minimum recognition confidence
to retain a result.
return_word_box (bool | None): Return per-word bounding boxes in
addition to line-level boxes.
**kwargs: Additional arguments forwarded to the base class (e.g.
``device``, ``use_hpip``) or deprecated PaddleOCR 2.x parameter
names.

Example:
>>> from paddleocr import PaddleOCR
>>> ocr = PaddleOCR(lang="en")
>>> results = ocr.predict("image.png")
>>> for res in results:
... res.print()
"""

def __init__(
self,
doc_orientation_classify_model_name=None,
Expand Down Expand Up @@ -181,6 +229,33 @@ def predict_iter(
text_rec_score_thresh=None,
return_word_box=None,
):
"""Run OCR on ``input`` and yield one result object per image.

Args:
input: Image path (str), URL, numpy array, PIL Image, or an
iterable of any of the above.
use_doc_orientation_classify (bool | None): Override the
constructor setting for this call.
use_doc_unwarping (bool | None): Override the constructor setting
for this call.
use_textline_orientation (bool | None): Override the constructor
setting for this call.
text_det_limit_side_len (int | None): Maximum side length for
text detection input resizing.
text_det_limit_type (str | None): How to apply the side-length
limit (``"max"`` or ``"min"``).
text_det_thresh (float | None): Override detection pixel threshold.
text_det_box_thresh (float | None): Override detection box
threshold.
text_det_unclip_ratio (float | None): Override box expansion ratio.
text_rec_score_thresh (float | None): Override recognition
confidence threshold.
return_word_box (bool | None): Override per-word box setting.

Yields:
PaddleX OCR result objects with ``.print()``, ``.save_to_img()``,
and ``.save_to_json()`` methods.
"""
return self.paddlex_pipeline.predict(
input,
use_doc_orientation_classify=use_doc_orientation_classify,
Expand Down Expand Up @@ -210,6 +285,15 @@ def predict(
text_rec_score_thresh=None,
return_word_box=None,
):
"""Run OCR on ``input`` and return a list of result objects.

Convenience wrapper around :meth:`predict_iter` that collects all
results into a list. See :meth:`predict_iter` for the full parameter
documentation.

Returns:
list: One result object per input image.
"""
return list(
self.predict_iter(
input,
Expand Down
43 changes: 43 additions & 0 deletions paddleocr/_pipelines/paddleocr_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,49 @@


class PaddleOCRVL(PaddleXPipelineWrapper):
"""Vision-Language document understanding pipeline (PaddleOCR-VL).

Uses a compact Vision-Language Model (VLM) — by default PaddleOCR-VL-1.5
(0.9B parameters) — to parse document images into structured Markdown.
Supports 109+ languages and handles challenging real-world conditions
such as skew, warping, scanning artifacts, and uneven illumination.

Args:
pipeline_version (str): VL pipeline version. ``"v1.5"`` (default)
uses PaddleOCR-VL-1.5; ``"v1"`` uses the earlier PaddleOCR-VL.
vl_rec_model_name (str | None): Name of the VL recognition model.
vl_rec_model_dir (str | None): Local directory for the VL model.
vl_rec_backend (str | None): Inference backend for the VL model.
One of ``"native"`` (default), ``"vllm-server"``,
``"sglang-server"``, ``"fastdeploy-server"``,
``"mlx-vlm-server"``, ``"llama-cpp-server"``.
vl_rec_server_url (str | None): Server URL when using a server
backend.
vl_rec_api_key (str | None): API key for API-based backends.
layout_detection_model_name (str | None): Name of the layout
detection model.
layout_detection_model_dir (str | None): Local directory for the
layout detection model.
use_doc_orientation_classify (bool | None): Enable document
orientation classification preprocessing.
use_doc_unwarping (bool | None): Enable document unwarping
preprocessing.
use_layout_detection (bool | None): Enable layout detection.
use_chart_recognition (bool | None): Enable chart parsing.
use_seal_recognition (bool | None): Enable seal text recognition.
use_ocr_for_image_block (bool | None): Run OCR on image blocks.
**kwargs: Additional arguments forwarded to the base class (e.g.
``device``, ``use_hpip``).

Example:
>>> from paddleocr import PaddleOCRVL
>>> pipeline = PaddleOCRVL()
>>> results = pipeline.predict("document.png")
>>> for res in results:
... print(res.markdown)
"""

def __init__(
self,
pipeline_version=_DEFAULT_PIPELINE_VERSION,
Expand Down
38 changes: 38 additions & 0 deletions paddleocr/_pipelines/pp_structurev3.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,44 @@


class PPStructureV3(PaddleXPipelineWrapper):
"""Document structure analysis pipeline (PP-StructureV3).

Parses complex documents into structured Markdown or JSON by combining
layout detection, OCR, table recognition, formula recognition, chart
recognition, and seal recognition sub-pipelines.

Args:
lang (str | None): Language code for OCR sub-pipelines (e.g.
``"ch"``, ``"en"``). Defaults to ``"ch"`` when ``None``.
ocr_version (str | None): PP-OCR version for the text detection and
recognition models. One of ``"PP-OCRv3"``, ``"PP-OCRv4"``,
``"PP-OCRv5"``.
layout_detection_model_name (str | None): Name of the layout
detection model.
layout_detection_model_dir (str | None): Local directory for the
layout detection model.
use_doc_orientation_classify (bool | None): Enable document
orientation classification.
use_doc_unwarping (bool | None): Enable document unwarping.
use_textline_orientation (bool | None): Enable text-line orientation
classification.
use_seal_recognition (bool | None): Enable seal text recognition.
use_table_recognition (bool | None): Enable table structure
recognition.
use_formula_recognition (bool | None): Enable formula recognition.
use_chart_recognition (bool | None): Enable chart parsing.
use_region_detection (bool | None): Enable region detection.
**kwargs: Additional arguments forwarded to the base class (e.g.
``device``, ``use_hpip``).

Example:
>>> from paddleocr import PPStructureV3
>>> pipeline = PPStructureV3()
>>> results = pipeline.predict("document.pdf")
>>> for res in results:
... print(res.markdown)
"""

def __init__(
self,
layout_detection_model_name=None,
Expand Down
Loading