docling-project
diff --git a/‎.github/CI_LABELS.md‎
Lines changed: 39 additions & 0 deletions b/‎.github/CI_LABELS.md‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎.github/actions/setup-ubuntu-ci/action.yml‎
Lines changed: 77 additions & 0 deletions b/‎.github/actions/setup-ubuntu-ci/action.yml‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎.github/scripts/check_needs_results.py‎
Lines changed: 70 additions & 0 deletions b/‎.github/scripts/check_needs_results.py‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎.github/scripts/pytest_marker_selection.py‎
Lines changed: 191 additions & 0 deletions b/‎.github/scripts/pytest_marker_selection.py‎
Lines changed: 191 additions & 0 deletions
@@ -0,0 +1,39 @@
+# CI labels
+
+The pull request workflows recognize these optional maintainer labels:
+
+- `tests:full`: run the full Linux CI matrix for the PR, including all ML
+  suites and package compatibility lanes.
+- `tests:heavy-examples`: run the heavy examples workflow for the PR.
+
+Windows and macOS smoke lanes are intentionally not label-triggered. Run them
+from the `Run CI` or `Run CI Main` workflow dispatch inputs when cross-platform
+verification is needed.
+
+## ML test segmentation
+
+Expensive ML tests are selected with module-level pytest markers, not workflow
+file globs:
+
+- `pytest.mark.ml_ocr`
+- `pytest.mark.ml_pdf_model`
+- `pytest.mark.ml_vlm`
+- `pytest.mark.ml_asr`
+
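+New tests run in the core lane by default. If a new test belongs in an ML lane,
+add the matching module-level `pytestmark` (for example,
+`pytestmark = pytest.mark.ml_ocr`); do not add per-test file globs to the
+workflow. The marker selection script rejects CI markers that are applied
+anywhere other than a module-level `pytestmark`.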
+
+The workflow intentionally uses a broad ML trigger for code, test, and tooling
+changes. Tach performs the fine-grained affected-test selection inside the ML
+lanes.
+
+Path filters still decide whether a CI lane should be created at all. Pytest
+markers only select which test modules run after a test lane has started.
+
+## Cross-platform smoke tests
+
+Windows and macOS smoke tests are selected with `pytest.mark.cross_platform`.
+Use this marker for lightweight modules that should be exercised by the
+workflow-dispatch cross-platform lanes; do not maintain a separate test-file
+list in the workflow.
@@ -0,0 +1,77 @@
+# Composite action: optionally installs Ubuntu system packages, installs uv
+# with the requested Python version, optionally runs `uv sync`, and optionally
+# caches downloaded models.
+name: Setup Ubuntu CI
+description: Set up Python, uv, and optional Docling CI dependencies on Ubuntu.
+
+# Boolean-like inputs are strings; the step conditions below compare them
+# against the literal 'true'.
+inputs:
+  python_version:
+    description: Python version passed to astral-sh/setup-uv.
+    required: true
+  enable_cache:
+    description: Whether to enable the uv cache.
+    default: "true"
+  uv_sync_args:
+    description: Arguments passed to `uv sync`. Leave empty to skip syncing.
+    default: ""
+  install_system_deps:
+    description: Whether to install Ubuntu OCR and office dependencies.
+    default: "false"
+  cache_models:
+    description: Whether to restore and save the shared model cache.
+    default: "false"
+
+runs:
+  using: composite
+  steps:
+    - name: Grant permissions to APT cache directory # allows restore
+      if: ${{ inputs.install_system_deps == 'true' }}
+      shell: bash
+      run: sudo chown -R "$USER:$USER" /var/cache/apt/archives
+
+    # The cache key hashes this action file, so editing the package list in
+    # the install step below produces a new key; restore-keys still allows
+    # falling back to an older APT cache for the same runner OS.
+    - name: Cache APT packages
+      if: ${{ inputs.install_system_deps == 'true' }}
+      uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+      with:
+        path: /var/cache/apt/archives
+        key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/actions/setup-ubuntu-ci/action.yml') }}
+        restore-keys: |
+          apt-packages-${{ runner.os }}-
+
+    - name: Install system dependencies
+      if: ${{ inputs.install_system_deps == 'true' }}
+      shell: bash
+      run: |
+        sudo apt-get -qq update
+        sudo apt-get -qq install -y build-essential ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
+
+    # Looks up the path ending in "tessdata" among the files installed by the
+    # tesseract-ocr-eng package and exports it to later steps via GITHUB_ENV.
+    - name: Set TESSDATA_PREFIX
+      if: ${{ inputs.install_system_deps == 'true' }}
+      shell: bash
+      run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
+
+    # NOTE(review): this action is referenced by a mutable tag, whereas
+    # actions/cache in this file is pinned to a commit SHA - consider pinning
+    # this one the same way for consistency.
+    - name: Install uv and set the python version
+      uses: astral-sh/setup-uv@v7
+      with:
+        python-version: ${{ inputs.python_version }}
+        enable-cache: ${{ inputs.enable_cache }}
+
+    # NOTE(review): the input is expanded directly into the shell script, so
+    # callers must only pass trusted, workflow-controlled values here.
+    - name: Install Python dependencies
+      if: ${{ inputs.uv_sync_args != '' }}
+      shell: bash
+      run: uv sync ${{ inputs.uv_sync_args }}
+
+    # The key changes whenever uv.lock or pyproject.toml changes; restore-keys
+    # fall back first to any model cache for this runner OS, then to any model
+    # cache at all.
+    - name: Cache models
+      if: ${{ inputs.cache_models == 'true' }}
+      uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+      with:
+        path: |
+          ~/.cache/huggingface
+          ~/.cache/modelscope
+          ~/.EasyOCR/
+        key: models-cache-${{ runner.os }}-${{ hashFiles('uv.lock', 'pyproject.toml') }}
+        restore-keys: |
+          models-cache-${{ runner.os }}-
+          models-cache-
+
+    # Repeated after the sudo apt-get install above; presumably the newly
+    # downloaded archives are root-owned and must be readable for the cache
+    # save - confirm.
+    - name: Grant permissions to APT cache directory # allows backup
+      if: ${{ inputs.install_system_deps == 'true' }}
+      shell: bash
+      run: sudo chown -R "$USER:$USER" /var/cache/apt/archives
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+import argparse
+import json
+from typing import Any
+
+# Job `result` values that collect_failures treats specially: "success" always
+# passes, "skipped" passes only for jobs listed in the allowed skips.
+SUCCESS = "success"
+SKIPPED = "skipped"
+
+
+def parse_allowed_skips(raw_allowed_skips: str) -> set[str]:
+    return {job for job in raw_allowed_skips.split() if job}
+
+
+def parse_needs(raw_needs: str) -> dict[str, Any]:
+    loaded = json.loads(raw_needs)
+    if not isinstance(loaded, dict):
+        msg = "--needs-json must decode to a JSON object."
+        raise ValueError(msg)
+    return loaded
+
+
+def result_for_job(job: str, value: Any) -> str:
+    if not isinstance(value, dict):
+        msg = f"{job}: needs entry must be a JSON object."
+        raise ValueError(msg)
+
+    result = value.get("result")
+    if not isinstance(result, str):
+        msg = f"{job}: needs entry must contain a string result."
+        raise ValueError(msg)
+
+    return result
+
+
+def collect_failures(needs: dict[str, Any], allowed_skips: set[str]) -> list[str]:
+    failures: list[str] = []
+    for job, value in needs.items():
+        result = result_for_job(job, value)
+        if result == SUCCESS:
+            continue
+        if result == SKIPPED and job in allowed_skips:
+            print(f"::notice title=Allowed skipped job::{job}")
+            continue
+        failures.append(f"{job}={result}")
+    return failures
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Fail unless all required GitHub Actions needs succeeded."
+    )
+    parser.add_argument("--needs-json", required=True)
+    parser.add_argument("--allowed-skips", default="")
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    needs = parse_needs(args.needs_json)
+    allowed_skips = parse_allowed_skips(args.allowed_skips)
+    failures = collect_failures(needs, allowed_skips)
+    if failures:
+        print(f"::error title=Required jobs failed::{', '.join(failures)}")
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,191 @@
+from __future__ import annotations
+
+import argparse
+import ast
+import json
+import os
+from pathlib import Path
+
+# Pytest marker names that assign a test module to one of the ML suites.
+ML_MARKERS = ("ml_ocr", "ml_pdf_model", "ml_vlm", "ml_asr")
+# Pytest marker name for cross-platform test modules.
+CROSS_PLATFORM_MARKER = "cross_platform"
+# Every marker this script looks for when scanning test modules.
+CI_FILE_MARKERS = (*ML_MARKERS, CROSS_PLATFORM_MARKER)
+# Suite name (as accepted by the suite-args / suite-marker commands) -> marker.
+SUITE_MARKERS = {
+    "ocr": "ml_ocr",
+    "pdf-model": "ml_pdf_model",
+    "vlm": "ml_vlm",
+    "asr": "ml_asr",
+}
+# Reverse lookup: marker -> suite name.
+MARKER_SUITES = {marker: suite for suite, marker in SUITE_MARKERS.items()}
+
+
+def parse_bool(value: str) -> bool:
+    return value.lower() == "true"
+
+
+def is_pytest_mark_attribute(node: ast.AST, marker: str) -> bool:
+    if not isinstance(node, ast.Attribute) or node.attr != marker:
+        return False
+    if not isinstance(node.value, ast.Attribute) or node.value.attr != "mark":
+        return False
+    return isinstance(node.value.value, ast.Name) and node.value.value.id == "pytest"
+
+
+def markers_in_node(node: ast.AST) -> set[str]:
+    markers: set[str] = set()
+    for child in ast.walk(node):
+        for marker in CI_FILE_MARKERS:
+            if is_pytest_mark_attribute(child, marker):
+                markers.add(marker)
+    return markers
+
+
+def module_level_ci_markers(tree: ast.Module) -> set[str]:
+    markers: set[str] = set()
+    for statement in tree.body:
+        value: ast.AST | None = None
+        if isinstance(statement, ast.Assign) and any(
+            isinstance(target, ast.Name) and target.id == "pytestmark"
+            for target in statement.targets
+        ):
+            value = statement.value
+        elif (
+            isinstance(statement, ast.AnnAssign)
+            and isinstance(statement.target, ast.Name)
+            and statement.target.id == "pytestmark"
+        ):
+            value = statement.value
+
+        if value is not None:
+            markers.update(markers_in_node(value))
+
+    return markers
+
+
+def detect_ci_markers(path: Path) -> set[str]:
+    if not path.exists() or path.suffix != ".py":
+        return set()
+
+    tree = ast.parse(path.read_text(encoding="utf-8"), filename=str(path))
+    all_markers = markers_in_node(tree)
+    module_markers = module_level_ci_markers(tree)
+    if all_markers != module_markers:
+        raise ValueError(
+            f"{path}: CI pytest markers must be declared with module-level "
+            "`pytestmark` so CI can select whole test modules."
+        )
+    return module_markers
+
+
+def detect_ml_markers(path: Path) -> set[str]:
+    return detect_ci_markers(path) & set(ML_MARKERS)
+
+
+def discover_test_markers(repo_root: Path) -> dict[str, list[Path]]:
+    discovered: dict[str, list[Path]] = {marker: [] for marker in CI_FILE_MARKERS}
+    tests_dir = repo_root / "tests"
+    if not tests_dir.exists():
+        return discovered
+
+    for path in sorted(tests_dir.rglob("*.py")):
+        markers = detect_ci_markers(path)
+        for marker in markers:
+            discovered[marker].append(path.relative_to(repo_root))
+
+    return discovered
+
+
+def build_ml_suites(*, run_all_ml: bool) -> list[str]:
+    if not run_all_ml:
+        return []
+
+    return [MARKER_SUITES[marker] for marker in ML_MARKERS]
+
+
+def write_github_output(name: str, value: str) -> None:
+    output_path = os.environ.get("GITHUB_OUTPUT")
+    if output_path is None:
+        print(f"{name}={value}")
+        return
+
+    with Path(output_path).open("a", encoding="utf-8") as output_file:
+        output_file.write(f"{name}={value}\n")
+
+
+def print_paths(paths: list[Path]) -> None:
+    for path in paths:
+        print(path.as_posix())
+
+
+def run_matrix(args: argparse.Namespace) -> None:
+    suites = build_ml_suites(run_all_ml=parse_bool(args.run_all_ml))
+    write_github_output("ml_suites", json.dumps(suites, separators=(",", ":")))
+
+
+def run_core_ignore_args(args: argparse.Namespace) -> None:
+    discovered = discover_test_markers(args.repo_root)
+    marked_paths = sorted(
+        {path for marker in ML_MARKERS for path in discovered[marker]}
+    )
+    for path in marked_paths:
+        print(f"--ignore={path.as_posix()}")
+
+
+def run_suite_args(args: argparse.Namespace) -> None:
+    if args.suite not in SUITE_MARKERS:
+        raise ValueError(f"Unknown ML suite: {args.suite}")
+
+    discovered = discover_test_markers(args.repo_root)
+    print_paths(discovered[SUITE_MARKERS[args.suite]])
+
+
+def run_suite_marker(args: argparse.Namespace) -> None:
+    if args.suite not in SUITE_MARKERS:
+        raise ValueError(f"Unknown ML suite: {args.suite}")
+
+    print(SUITE_MARKERS[args.suite])
+
+
+def run_marker_args(args: argparse.Namespace) -> None:
+    if args.marker not in CI_FILE_MARKERS:
+        raise ValueError(f"Unknown CI marker: {args.marker}")
+
+    discovered = discover_test_markers(args.repo_root)
+    print_paths(discovered[args.marker])
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Select pytest modules for Docling's marker-based CI lanes."
+    )
+    parser.add_argument("--repo-root", type=Path, default=Path("."))
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    matrix_parser = subparsers.add_parser("matrix")
+    matrix_parser.add_argument("--run-all-ml", default="false")
+    matrix_parser.set_defaults(func=run_matrix)
+
+    core_parser = subparsers.add_parser("core-ignore-args")
+    core_parser.set_defaults(func=run_core_ignore_args)
+
+    suite_parser = subparsers.add_parser("suite-args")
+    suite_parser.add_argument("suite")
+    suite_parser.set_defaults(func=run_suite_args)
+
+    marker_parser = subparsers.add_parser("suite-marker")
+    marker_parser.add_argument("suite")
+    marker_parser.set_defaults(func=run_suite_marker)
+
+    marker_args_parser = subparsers.add_parser("marker-args")
+    marker_args_parser.add_argument("marker")
+    marker_args_parser.set_defaults(func=run_marker_args)
+
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()