[Feat] Add CLI support for OCR (#2058)

simont2k · web-flow · commit db9d92a72c45 · 2026-05-07T11:01:52.000+02:00
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -83,6 +83,7 @@ Supported datasets
    using_doctr/using_model_export
    using_doctr/custom_models_training
    using_doctr/running_on_aws
+   using_doctr/using_cli
 
 
 .. toctree::
diff --git a/docs/source/using_doctr/using_cli.rst b/docs/source/using_doctr/using_cli.rst
@@ -0,0 +1,73 @@
+Using the CLI for Optical Character Recognition
+===============================================
+
+The full Optical Character Recognition (OCR) task can be executed by using the Command Line Interface (CLI) implemented in docTR. This tool allows you to process both images and PDF files without writing a single line of Python code, providing a streamlined way to export OCR results directly to JSON.
+
+Basic Usage
+-----------
+
+To run the OCR engine on a file, use the following command structure:
+
+.. code-block:: bash
+
+    doctr-cli --input_path path/to/your/document.pdf --output results.json
+
+Arguments
+---------
+
+The CLI supports a variety of arguments to fine-tune the detection and recognition process:
+
+**Mandatory Arguments:**
+
+* ``--input_path``: Path to the input image or PDF file you wish to process.
+
+**Architecture Selection:**
+
+* ``--det_arch``: The detection architecture / model to use (e.g., ``db_resnet50``). *Default: db_resnet50*
+* ``--reco_arch``: The recognition architecture / model to use (e.g., ``crnn_vgg16_bn``). *Default: crnn_vgg16_bn*
+
+**Processing Options:**
+
+* ``--assume_straight_pages``, ``--no-assume_straight_pages``: Determine whether pages should be handled as straight or skewed pages. *Default: True*
+* ``--straighten_pages``: If flagged, the tool will attempt to straighten skewed pages before analysis. *Default: False*
+* ``--preserve_aspect_ratio``, ``--no-preserve_aspect_ratio``: Ensures that the aspect ratio is maintained during resizing. *Default: True*
+* ``--symmetric_pad``: If flagged, applies symmetric padding to the input images. *Default: False*
+* ``--det_bs``: Batch size used for the detection model. *Default: 2*
+* ``--reco_bs``: Batch size used for the recognition model. *Default: 128*
+* ``--detect_orientation``: Enables automatic detection of page orientation. *Default: False*
+* ``--detect_language``: Enables language detection for the extracted text. *Default: False*
+
+**Output Options:**
+
+* ``--output``: The destination path where the JSON results will be saved. *Default: results.json*
+
+Examples
+--------
+
+**Running OCR on an image:**
+
+.. code-block:: bash
+
+    doctr-cli --input_path image.jpg --output ocr_res.json
+
+**Running OCR on a PDF:**
+
+.. code-block:: bash
+
+    doctr-cli --input_path image.pdf --output ocr_res.json
+
+**Using a specific detection architecture and straightening pages:**
+
+.. code-block:: bash
+
+    doctr-cli --input_path doc.pdf --det_arch db_mobilenet_v3_large --straighten_pages
+
+Output Format
+-------------
+
+The results are exported in a structured JSON format containing:
+
+* **Pages**: Dimensions and orientation.
+* **Blocks**: Grouping of lines.
+* **Lines**: Grouping of words.
+* **Words**: The actual text content with confidence scores and bounding box coordinates.
diff --git a/doctr/cli/__init__.py b/doctr/cli/__init__.py
diff --git a/doctr/cli/main.py b/doctr/cli/main.py
@@ -0,0 +1,117 @@
+# Copyright (C) 2021-2026, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+import argparse
+import json
+import logging
+import sys
+
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+
+logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
+
+
+def main(argv=None):
+    """Main function for the docTR CLI tool"""
+    # parse command-line arguments and set up the model
+    args = _parse_args(argv)
+    model = ocr_predictor(
+        det_arch=args.det_arch,
+        reco_arch=args.reco_arch,
+        pretrained=True,
+        assume_straight_pages=args.assume_straight_pages,
+        preserve_aspect_ratio=args.preserve_aspect_ratio,
+        symmetric_pad=args.symmetric_pad,
+        detect_orientation=args.detect_orientation,
+        straighten_pages=args.straighten_pages,
+        detect_language=args.detect_language,
+        det_bs=args.det_bs,
+        reco_bs=args.reco_bs,
+    )
+
+    # load the document
+    try:
+        if args.input_path.lower().endswith(".pdf"):
+            doc = DocumentFile.from_pdf(args.input_path)
+        else:
+            doc = DocumentFile.from_images(args.input_path)
+        logging.info(f"Document loaded successfully from {args.input_path}")
+    except FileNotFoundError:
+        logging.error(f"File not found: {args.input_path}")
+        sys.exit(1)
+    except ValueError:
+        logging.error(f"File could not be read as a valid image or PDF: {args.input_path}")
+        sys.exit(1)
+    except Exception as e:
+        logging.error(f"Error occurred while loading the document: {e}")
+        sys.exit(1)
+
+    # perform OCR
+    logging.info("Performing OCR...")
+    result = model(doc)
+
+    # save results to JSON file
+    try:
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(result.export(), f, indent=4, ensure_ascii=False)
+        logging.info(f"Results saved to {args.output}")
+    except FileNotFoundError:
+        logging.error(f"Could not write output file at given path: {args.output}")
+        sys.exit(1)
+    except Exception as e:
+        logging.error(f"Results could not be saved: {e}")
+        sys.exit(1)
+
+
+def _parse_args(argv=None):
+    parser = argparse.ArgumentParser(
+        description="docTR CLI tool for OCR prediction on images and PDFs",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    # required input path
+    parser.add_argument("--input_path", type=str, required=True, help="path to input image or PDF file")
+
+    # architecture selection
+    parser.add_argument(
+        "--det_arch",
+        type=str,
+        default="db_resnet50",
+        help="name of the detection architecture or the model itself to use",
+    )
+    parser.add_argument(
+        "--reco_arch",
+        type=str,
+        default="crnn_vgg16_bn",
+        help="name of the recognition architecture or the model itself to use",
+    )
+
+    # processing options
+    parser.add_argument(
+        "--assume_straight_pages",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="assume only straight pages without rotated textual elements",
+    )
+    parser.add_argument(
+        "--straighten_pages", action="store_true", help="attempt to straighten skewed pages before analysis"
+    )
+    parser.add_argument(
+        "--preserve_aspect_ratio",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="preserve aspect ratio when resizing pages",
+    )
+    parser.add_argument("--symmetric_pad", action="store_true", help="apply symmetric padding")
+    parser.add_argument("--det_bs", type=int, default=2, help="batch size for detection")
+    parser.add_argument("--reco_bs", type=int, default=128, help="batch size for recognition")
+    parser.add_argument("--detect_orientation", action="store_true", help="automatically detect page orientation")
+    parser.add_argument("--detect_language", action="store_true", help="detect language of the text")
+
+    # output options
+    parser.add_argument("--output", type=str, default="results.json", help="path to output results in JSON format")
+
+    return parser.parse_args(argv)
diff --git a/pyproject.toml b/pyproject.toml
@@ -54,6 +54,9 @@ dependencies = [
     "tqdm>=4.30.0",
 ]
 
+[project.scripts]
+doctr-cli = "doctr.cli.main:main"
+
 [project.optional-dependencies]
 html = [
     "weasyprint>=55.0",
diff --git a/tests/common/test_cli.py b/tests/common/test_cli.py
@@ -0,0 +1,110 @@
+from pathlib import Path
+
+import pytest
+
+import doctr.cli.main as cli
+
+
+def test_parse_args_defaults():
+    args = cli._parse_args(["--input_path", "sample.pdf"])
+
+    assert args.input_path == "sample.pdf"
+    assert args.det_arch == "db_resnet50"
+    assert args.reco_arch == "crnn_vgg16_bn"
+    assert args.assume_straight_pages is True
+    assert args.preserve_aspect_ratio is True
+    assert args.symmetric_pad is False
+    assert args.det_bs == 2
+    assert args.reco_bs == 128
+    assert args.detect_orientation is False
+    assert args.detect_language is False
+
+
+def test_parse_args_boolean_optional_flags():
+    args = cli._parse_args([
+        "--input_path",
+        "sample.pdf",
+        "--no-assume_straight_pages",
+        "--no-preserve_aspect_ratio",
+    ])
+
+    assert args.assume_straight_pages is False
+    assert args.preserve_aspect_ratio is False
+
+
+def test_parse_args_requires_input_path():
+    with pytest.raises(SystemExit):
+        cli._parse_args([])
+
+
+def test_parse_args_custom_values():
+    args = cli._parse_args([
+        "--input_path",
+        "sample.pdf",
+        "--det_arch",
+        "custom_det",
+        "--reco_arch",
+        "custom_reco",
+        "--symmetric_pad",
+        "--detect_orientation",
+        "--detect_language",
+        "--output",
+        "output.json",
+    ])
+
+    assert args.input_path == "sample.pdf"
+    assert args.det_arch == "custom_det"
+    assert args.reco_arch == "custom_reco"
+    assert args.symmetric_pad is True
+    assert args.detect_orientation is True
+    assert args.detect_language is True
+    assert args.output == "output.json"
+
+
+def test_main_with_image(mock_image_path):
+    output_path = "results.json"
+    cli.main(["--input_path", mock_image_path, "--output", output_path])
+
+    assert Path(output_path).exists()
+
+
+def test_main_with_pdf(mock_pdf):
+    output_path = "results.json"
+    cli.main(["--input_path", mock_pdf, "--output", output_path])
+
+    assert Path(output_path).exists()
+
+
+def test_main_no_input_path():
+    with pytest.raises(SystemExit):
+        cli.main([])
+
+
+def test_main_invalid_input_path():
+    with pytest.raises(SystemExit):
+        cli.main(["--input_path", "non_existent_file.pdf", "--output", "results.json"])
+
+
+def test_main_unsupported_input_file_format(tmp_path):
+    unsupported_file = tmp_path / "unsupported.txt"
+    unsupported_file.write_text("This is not a valid image or PDF file.")
+    with pytest.raises(SystemExit):
+        cli.main(["--input_path", str(unsupported_file), "--output", "results.json"])
+
+
+def test_main_corrupted_input_file(tmp_path):
+    corrupted_pdf = tmp_path / "corrupted.pdf"
+    corrupted_pdf.write_text("not a real pdf")
+
+    with pytest.raises(SystemExit):
+        cli.main(["--input_path", str(corrupted_pdf), "--output", "results.json"])
+
+
+def test_main_output_path_not_a_file(mock_image_path):
+    with pytest.raises(SystemExit):
+        cli.main(["--input_path", mock_image_path, "--output", "."])
+
+
+def test_main_output_path_invalid_directory(mock_image_path):
+    with pytest.raises(SystemExit):
+        cli.main(["--input_path", mock_image_path, "--output", "non_existent_dir/results.json"])

Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,9 @@ dependencies = [`
`54`	`54`	`"tqdm>=4.30.0",`
`55`	`55`	`]`
`56`	`56`
	`57`	`+[project.scripts]`
	`58`	`+doctr-cli = "doctr.cli.main:main"`
	`59`	`+`
`57`	`60`	`[project.optional-dependencies]`
`58`	`61`	`html = [`
`59`	`62`	`"weasyprint>=55.0",`