diff --git a/paddleocr/_models/base.py b/paddleocr/_models/base.py index 2dd88bc0db9..7db41390445 100644 --- a/paddleocr/_models/base.py +++ b/paddleocr/_models/base.py @@ -28,6 +28,27 @@ class PaddleXPredictorWrapper(metaclass=abc.ABCMeta): + """Base class for single-model PaddleOCR wrappers. + + Subclasses wrap a PaddleX predictor for a specific model (e.g. text + detection, text recognition) and expose ``predict`` / ``predict_iter`` + methods along with optional CLI support. Each subclass must declare + ``default_model_name`` and ``get_cli_subcommand_executor``. + + Args: + model_name (str | None): Name of the model to load. Defaults to + ``default_model_name`` when ``None``. + model_dir (str | None): Local directory containing model files. + Downloads from the model hub when ``None``. + **common_args: Common inference arguments forwarded to PaddleX (e.g. + ``device``, ``use_hpip``, ``use_tensorrt``). + + Example: + >>> from paddleocr import TextDetection + >>> detector = TextDetection() + >>> results = detector.predict("image.png") + """ + def __init__( self, *, @@ -83,6 +104,12 @@ def _create_paddlex_predictor(self): class PredictorCLISubcommandExecutor(CLISubcommandExecutor): + """Base class for single-model CLI subcommand executors. + + Registers a model predictor as a subcommand of the ``paddleocr`` CLI and + handles argument parsing and execution. 
+ """ + @property @abc.abstractmethod def subparser_name(self): diff --git a/paddleocr/_pipelines/base.py b/paddleocr/_pipelines/base.py index f400cab41e9..c1c6e101696 100644 --- a/paddleocr/_pipelines/base.py +++ b/paddleocr/_pipelines/base.py @@ -31,6 +31,7 @@ def _merge_dicts(d1, d2): + """Recursively merge d2 over a copy of d1, with d2 values taking precedence.""" res = d1.copy() for k, v in d2.items(): if k in res and isinstance(res[k], dict) and isinstance(v, dict): @@ -41,6 +42,7 @@ def _merge_dicts(d1, d2): def _to_builtin(obj): + """Recursively convert AttrDict and nested structures to plain Python dicts/lists.""" if isinstance(obj, AttrDict): return {k: _to_builtin(v) for k, v in obj.items()} elif isinstance(obj, dict): @@ -52,6 +54,25 @@ def _to_builtin(obj): class PaddleXPipelineWrapper(metaclass=abc.ABCMeta): + """Base class for PaddleOCR pipeline wrappers. + + Subclasses wrap a PaddleX pipeline and expose a simplified Python API + with optional CLI support. Each subclass must declare + ``_paddlex_pipeline_name`` and ``get_cli_subcommand_executor``. + + Args: + paddlex_config (str | dict | None): Path to a PaddleX pipeline YAML + config file, a pre-loaded config dict, or ``None`` to use the + default config for the pipeline. + **common_args: Common inference arguments forwarded to PaddleX (e.g. + ``device``, ``use_hpip``, ``use_tensorrt``). + + Example: + >>> from paddleocr import PaddleOCR + >>> ocr = PaddleOCR(lang="en") + >>> results = ocr.predict("image.png") + """ + def __init__( self, *, @@ -110,6 +131,12 @@ def _create_paddlex_pipeline(self): class PipelineCLISubcommandExecutor(CLISubcommandExecutor): + """Base class for pipeline CLI subcommand executors. + + Registers a pipeline as a subcommand of the ``paddleocr`` CLI and handles + argument parsing and execution. 
+ """ + @property @abc.abstractmethod def subparser_name(self): diff --git a/paddleocr/_pipelines/ocr.py b/paddleocr/_pipelines/ocr.py index 151b0fd6118..e48dca675ce 100644 --- a/paddleocr/_pipelines/ocr.py +++ b/paddleocr/_pipelines/ocr.py @@ -53,6 +53,54 @@ # Be comptable with PaddleOCR 2.x interfaces class PaddleOCR(PaddleXPipelineWrapper): + """OCR pipeline that combines text detection and text recognition. + + Runs the full OCR pipeline: optional document preprocessing (orientation + classification and unwarping), text detection, optional text-line + orientation classification, and text recognition. + + Args: + lang (str | None): Language code for the input image (e.g. ``"ch"``, + ``"en"``, ``"fr"``). Used to select default detection and + recognition models when no explicit model name/dir is provided. + Defaults to ``"ch"`` when ``None``. + ocr_version (str | None): PP-OCR version to use when ``lang`` is set. + One of ``"PP-OCRv3"``, ``"PP-OCRv4"``, ``"PP-OCRv5"``. Defaults + to the latest available for the chosen language. + text_detection_model_name (str | None): Name of the text detection + model. Overrides ``lang``/``ocr_version`` selection. + text_detection_model_dir (str | None): Local directory for the text + detection model. + text_recognition_model_name (str | None): Name of the text recognition + model. Overrides ``lang``/``ocr_version`` selection. + text_recognition_model_dir (str | None): Local directory for the text + recognition model. + use_doc_orientation_classify (bool | None): Enable document orientation + classification preprocessing. + use_doc_unwarping (bool | None): Enable document unwarping + preprocessing. + use_textline_orientation (bool | None): Enable text-line orientation + classification. + text_det_thresh (float | None): Pixel-level detection threshold. + text_det_box_thresh (float | None): Box-level detection threshold. + text_det_unclip_ratio (float | None): Expansion ratio for detected + text bounding boxes. 
+ text_rec_score_thresh (float | None): Minimum recognition confidence + to retain a result. + return_word_box (bool | None): Return per-word bounding boxes in + addition to line-level boxes. + **kwargs: Additional arguments forwarded to the base class (e.g. + ``device``, ``use_hpip``) or deprecated PaddleOCR 2.x parameter + names. + + Example: + >>> from paddleocr import PaddleOCR + >>> ocr = PaddleOCR(lang="en") + >>> results = ocr.predict("image.png") + >>> for res in results: + ... res.print() + """ + def __init__( self, doc_orientation_classify_model_name=None, @@ -181,6 +229,33 @@ def predict_iter( text_rec_score_thresh=None, return_word_box=None, ): + """Run OCR on ``input`` and yield one result object per image. + + Args: + input: Image path (str), URL, numpy array, PIL Image, or an + iterable of any of the above. + use_doc_orientation_classify (bool | None): Override the + constructor setting for this call. + use_doc_unwarping (bool | None): Override the constructor setting + for this call. + use_textline_orientation (bool | None): Override the constructor + setting for this call. + text_det_limit_side_len (int | None): Side-length limit for + text detection input resizing. + text_det_limit_type (str | None): How to apply the side-length + limit (``"max"`` or ``"min"``). + text_det_thresh (float | None): Override detection pixel threshold. + text_det_box_thresh (float | None): Override detection box + threshold. + text_det_unclip_ratio (float | None): Override box expansion ratio. + text_rec_score_thresh (float | None): Override recognition + confidence threshold. + return_word_box (bool | None): Override per-word box setting. + + Returns: + Iterable of PaddleX OCR result objects with ``.print()``, + ``.save_to_img()``, and ``.save_to_json()`` methods. 
+ """ return self.paddlex_pipeline.predict( input, use_doc_orientation_classify=use_doc_orientation_classify, @@ -210,6 +285,15 @@ def predict( text_rec_score_thresh=None, return_word_box=None, ): + """Run OCR on ``input`` and return a list of result objects. + + Convenience wrapper around :meth:`predict_iter` that collects all + results into a list. See :meth:`predict_iter` for the full parameter + documentation. + + Returns: + list: One result object per input image. + """ return list( self.predict_iter( input, diff --git a/paddleocr/_pipelines/paddleocr_vl.py b/paddleocr/_pipelines/paddleocr_vl.py index 9e1fc9b0f73..444fb3368ef 100644 --- a/paddleocr/_pipelines/paddleocr_vl.py +++ b/paddleocr/_pipelines/paddleocr_vl.py @@ -35,6 +35,49 @@ class PaddleOCRVL(PaddleXPipelineWrapper): + """Vision-Language document understanding pipeline (PaddleOCR-VL). + + Uses a compact Vision-Language Model (VLM) — PaddleOCR-VL-1.5 by + default, with 0.9B parameters — to parse document images into structured + Markdown. Supports 109+ languages and handles challenging real-world + conditions such as skew, warping, scanning artifacts, and uneven + illumination. + + Args: + pipeline_version (str): VL pipeline version. ``"v1.5"`` (default) + uses PaddleOCR-VL-1.5; ``"v1"`` uses the earlier PaddleOCR-VL. + vl_rec_model_name (str | None): Name of the VL recognition model. + vl_rec_model_dir (str | None): Local directory for the VL model. + vl_rec_backend (str | None): Inference backend for the VL model. + One of ``"native"`` (default), ``"vllm-server"``, + ``"sglang-server"``, ``"fastdeploy-server"``, + ``"mlx-vlm-server"``, ``"llama-cpp-server"``. + vl_rec_server_url (str | None): Server URL when using a server + backend. + vl_rec_api_key (str | None): API key for API-based backends. + layout_detection_model_name (str | None): Name of the layout + detection model. + layout_detection_model_dir (str | None): Local directory for the + layout detection model. 
+ use_doc_orientation_classify (bool | None): Enable document + orientation classification preprocessing. + use_doc_unwarping (bool | None): Enable document unwarping + preprocessing. + use_layout_detection (bool | None): Enable layout detection. + use_chart_recognition (bool | None): Enable chart parsing. + use_seal_recognition (bool | None): Enable seal text recognition. + use_ocr_for_image_block (bool | None): Run OCR on image blocks. + **kwargs: Additional arguments forwarded to the base class (e.g. + ``device``, ``use_hpip``). + + Example: + >>> from paddleocr import PaddleOCRVL + >>> pipeline = PaddleOCRVL() + >>> results = pipeline.predict("document.png") + >>> for res in results: + ... print(res.markdown) + """ + def __init__( self, pipeline_version=_DEFAULT_PIPELINE_VERSION, diff --git a/paddleocr/_pipelines/pp_structurev3.py b/paddleocr/_pipelines/pp_structurev3.py index eefe7d1c851..995af1a683f 100644 --- a/paddleocr/_pipelines/pp_structurev3.py +++ b/paddleocr/_pipelines/pp_structurev3.py @@ -29,6 +29,44 @@ class PPStructureV3(PaddleXPipelineWrapper): + """Document structure analysis pipeline (PP-StructureV3). + + Parses complex documents into structured Markdown or JSON by combining + layout detection, OCR, table recognition, formula recognition, chart + recognition, and seal recognition sub-pipelines. + + Args: + lang (str | None): Language code for OCR sub-pipelines (e.g. + ``"ch"``, ``"en"``). Defaults to ``"ch"`` when ``None``. + ocr_version (str | None): PP-OCR version for text detection and + recognition models. One of ``"PP-OCRv3"``, ``"PP-OCRv4"``, + ``"PP-OCRv5"``. + layout_detection_model_name (str | None): Name of the layout + detection model. + layout_detection_model_dir (str | None): Local directory for the + layout detection model. + use_doc_orientation_classify (bool | None): Enable document + orientation classification. + use_doc_unwarping (bool | None): Enable document unwarping. 
+ use_textline_orientation (bool | None): Enable text-line orientation + classification. + use_seal_recognition (bool | None): Enable seal text recognition. + use_table_recognition (bool | None): Enable table structure + recognition. + use_formula_recognition (bool | None): Enable formula recognition. + use_chart_recognition (bool | None): Enable chart parsing. + use_region_detection (bool | None): Enable region detection. + **kwargs: Additional arguments forwarded to the base class (e.g. + ``device``, ``use_hpip``). + + Example: + >>> from paddleocr import PPStructureV3 + >>> pipeline = PPStructureV3() + >>> results = pipeline.predict("document.pdf") + >>> for res in results: + ... print(res.markdown) + """ + def __init__( self, layout_detection_model_name=None,